未验证 提交 f9096f50 编写于 作者: W wankai123 提交者: GitHub

Support k8s monitoring (#6479)

上级 96611394
......@@ -68,6 +68,8 @@ Release Notes.
* Optimize the self monitoring grafana dashboard.
* Enhance the export service.
* Add function `retagByK8sMeta` and opt type `K8sRetagType.Pod2Service` in MAL for k8s to relate pods and services.
* Using "service.istio.io/canonical-name" to replace "app" label to resolve Envoy ALS service name.
* Support k8s monitoring.
* Make the flushing metrics operation concurrent.
* Fix ALS K8SServiceRegistry didn't remove the correct entry.
* Using "service.istio.io/canonical-name" to replace "app" label to resolve Envoy ALS service name.
......
......@@ -132,6 +132,9 @@ to be the identification of the metric data.
|istio-controlplane| Metrics of Istio control plane | otel-oc-rules/istio-controlplane.yaml | Istio Control Plane -> OpenTelemetry Collector --OC format--> SkyWalking OAP Server |
|oap| Metrics of SkyWalking OAP server itself | otel-oc-rules/oap.yaml | SkyWalking OAP Server(SelfObservability) -> OpenTelemetry Collector --OC format--> SkyWalking OAP Server |
|vm| Metrics of VMs | otel-oc-rules/vm.yaml | Prometheus node-exporter(VMs) -> OpenTelemetry Collector --OC format--> SkyWalking OAP Server |
|k8s-cluster| Metrics of K8s cluster | otel-oc-rules/k8s-cluster.yaml | K8s kube-state-metrics -> OpenTelemetry Collector --OC format--> SkyWalking OAP Server |
|k8s-node| Metrics of K8s nodes | otel-oc-rules/k8s-node.yaml | cAdvisor & K8s kube-state-metrics -> OpenTelemetry Collector --OC format--> SkyWalking OAP Server |
|k8s-service| Metrics of K8s services | otel-oc-rules/k8s-service.yaml | cAdvisor & K8s kube-state-metrics -> OpenTelemetry Collector --OC format--> SkyWalking OAP Server |
## Meter receiver
......
......@@ -27,7 +27,6 @@ import org.apache.skywalking.oap.meter.analyzer.dsl.Sample;
import org.apache.skywalking.oap.meter.analyzer.k8s.K8sInfoRegistry;
public enum K8sRetagType implements Retag {
Pod2Service {
@Override
public Sample[] execute(final Sample[] ss,
......@@ -39,11 +38,12 @@ public enum K8sRetagType implements Retag {
String namespace = sample.getLabels().get(namespaceLabelName);
if (!Strings.isNullOrEmpty(podName) && !Strings.isNullOrEmpty(namespace)) {
String serviceName = K8sInfoRegistry.getInstance().findServiceName(namespace, podName);
if (!Strings.isNullOrEmpty(serviceName)) {
Map<String, String> labels = Maps.newHashMap(sample.getLabels());
labels.put(newLabelName, serviceName);
return sample.toBuilder().labels(ImmutableMap.copyOf(labels)).build();
if (Strings.isNullOrEmpty(serviceName)) {
serviceName = BLANK;
}
Map<String, String> labels = Maps.newHashMap(sample.getLabels());
labels.put(newLabelName, serviceName);
return sample.toBuilder().labels(ImmutableMap.copyOf(labels)).build();
}
return sample;
}).toArray(Sample[]::new);
......
......@@ -21,5 +21,6 @@ package org.apache.skywalking.oap.meter.analyzer.dsl.tagOpt;
import org.apache.skywalking.oap.meter.analyzer.dsl.Sample;
/**
 * Operation that derives a new label for each sample from labels the sample already carries.
 */
public interface Retag {
/**
 * Empty label value. {@code K8sRetagType.Pod2Service} stores it under the new label when the
 * service-name lookup returns null or an empty string.
 */
String BLANK = "";
/**
 * Applies the retag operation to the given samples.
 *
 * @param ss                 samples to process
 * @param newLabelName       name of the label that receives the derived value
 * @param existingLabelName  name of an existing label used as input for the lookup
 *                           (presumably the pod-name label for Pod2Service; confirm with the implementation)
 * @param namespaceLabelName name of the existing label whose value is the namespace
 * @return the resulting samples
 */
Sample[] execute(Sample[] ss, String newLabelName, String existingLabelName, String namespaceLabelName);
}
......@@ -28,6 +28,7 @@ import java.util.Collection;
import java.util.Map;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.skywalking.oap.meter.analyzer.dsl.tagOpt.Retag;
import org.apache.skywalking.oap.meter.analyzer.k8s.K8sInfoRegistry;
import org.junit.Before;
import org.junit.Test;
......@@ -133,7 +134,7 @@ public class K8sTagTest {
.labels(
of(
"namespace", "default", "container", "my-nginx", "cpu", "total", "pod",
"my-nginx-5dc4865748-no-pod"
"my-nginx-5dc4865748-no-pod" , "service", Retag.BLANK
))
.value(2)
.build(),
......@@ -175,7 +176,7 @@ public class K8sTagTest {
.labels(
of(
"namespace", "default", "container", "my-nginx", "cpu", "total", "pod",
"my-nginx-5dc4865748-no-service"
"my-nginx-5dc4865748-no-service" , "service", Retag.BLANK
))
.value(2)
.build(),
......
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Where a rule expression takes a duration argument, the value is parsed from a textual
# representation of a duration. The accepted formats are based on the ISO-8601 duration
# format PnDTnHnMn.nS, with days considered to be exactly 24 hours.
#
# Examples:
#   "PT20.345S" -- parses as "20.345 seconds"
#   "PT15M"     -- parses as "15 minutes" (where a minute is 60 seconds)
#   "PT10H"     -- parses as "10 hours" (where an hour is 3600 seconds)
#   "P2D"       -- parses as "2 days" (where a day is 24 hours or 86400 seconds)
#   "P2DT3H4M"  -- parses as "2 days, 3 hours and 4 minutes"
#   "PT-6H3M"   -- parses as "-6 hours and +3 minutes"
#   "-PT6H3M"   -- parses as "-6 hours and -3 minutes"
#   "-PT-6H+3M" -- parses as "+6 hours and -3 minutes"
# Cluster-level K8s rules.
# expSuffix rewrites the 'cluster' tag to 'k8s-cluster::<cluster>' and calls service(['cluster']).
# NOTE(review): presumably this suffix is appended to every rule's exp and metricPrefix is
# prepended to every rule name -- confirm against the MAL rule documentation.
expSuffix: tag({tags -> tags.cluster = 'k8s-cluster::' + tags.cluster}).service(['cluster'])
metricPrefix: k8s_cluster
metricsRules:
# CPU: samples tagged resource == 'cpu', values multiplied by 1000, summed per cluster.
- name: cpu_cores
exp: (kube_node_status_capacity * 1000).tagEqual('resource' , 'cpu').sum(['cluster'])
- name: cpu_cores_allocatable
exp: (kube_node_status_allocatable * 1000).tagEqual('resource' , 'cpu').sum(['cluster'])
- name: cpu_cores_requests
exp: (kube_pod_container_resource_requests * 1000).tagEqual('resource' , 'cpu').sum(['cluster'])
- name: cpu_cores_limits
exp: (kube_pod_container_resource_limits * 1000).tagEqual('resource' , 'cpu').sum(['cluster'])
# Memory: samples tagged resource == 'memory', summed per cluster.
- name: memory_total
exp: kube_node_status_capacity.tagEqual('resource' , 'memory').sum(['cluster'])
- name: memory_allocatable
exp: kube_node_status_allocatable.tagEqual('resource' , 'memory').sum(['cluster'])
- name: memory_requests
exp: kube_pod_container_resource_requests.tagEqual('resource' , 'memory').sum(['cluster'])
- name: memory_limits
exp: kube_pod_container_resource_limits.tagEqual('resource' , 'memory').sum(['cluster'])
# Storage: samples tagged resource == 'ephemeral_storage', summed per cluster.
- name: storage_total
exp: kube_node_status_capacity.tagEqual('resource' , 'ephemeral_storage').sum(['cluster'])
- name: storage_allocatable
exp: kube_node_status_allocatable.tagEqual('resource' , 'ephemeral_storage').sum(['cluster'])
# Nodes: node_status filters on value 1 and a 'status' tag matching 'true|unknown',
# grouped by cluster, node and condition.
- name: node_total
exp: kube_node_info.sum(['cluster'])
- name: node_status
exp: kube_node_status_condition.valueEqual(1).tagMatch('status' , 'true|unknown').sum(['cluster' , 'node' ,'condition'])
- name: namespace_total
exp: kube_namespace_labels.sum(['cluster'])
# Deployments: deployment_status filters on the 'Available' condition with value 1, then
# removes the 'condition' tag from the grouped result.
- name: deployment_total
exp: kube_deployment_labels.sum(['cluster'])
- name: deployment_status
exp: kube_deployment_status_condition.valueEqual(1).tagMatch('condition' , 'Available').sum(['cluster' , 'deployment' ,'condition' , 'status']).tag({tags -> tags.remove('condition')})
- name: deployment_spec_replicas
exp: kube_deployment_spec_replicas.sum(['cluster' , 'deployment'])
# Services: service_pod_status adds a 'service' tag via retagByK8sMeta (Pod2Service, using the
# 'pod' and 'namespace' tags) and applies tagNotEqual('service' , '') so that samples left with
# an empty service value are excluded.
- name: service_total
exp: kube_service_info.sum(['cluster'])
- name: service_pod_status
exp: kube_pod_status_phase.retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').valueEqual(1).sum(['cluster' , 'service' , 'phase'])
# Pods and containers: pod_status_not_running filters on value 1 and a phase not matching 'Running'.
- name: pod_total
exp: kube_pod_info.sum(['cluster'])
- name: pod_status_not_running
exp: kube_pod_status_phase.valueEqual(1).tagNotMatch('phase' , 'Running').sum(['cluster' , 'pod' , 'phase'])
- name: container_total
exp: kube_pod_container_info.sum(['cluster'])
- name: pod_status_waiting
exp: kube_pod_container_status_waiting_reason.valueEqual(1).sum(['cluster' , 'pod' , 'container' , 'reason'])
- name: pod_status_terminated
exp: kube_pod_container_status_terminated_reason.valueEqual(1).sum(['cluster' , 'pod' , 'container' , 'reason'])
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Where a rule expression takes a duration argument (e.g. rate('PT1M') below), the value is
# parsed from a textual representation of a duration. The accepted formats are based on the
# ISO-8601 duration format PnDTnHnMn.nS, with days considered to be exactly 24 hours.
#
# Examples:
#   "PT20.345S" -- parses as "20.345 seconds"
#   "PT15M"     -- parses as "15 minutes" (where a minute is 60 seconds)
#   "PT10H"     -- parses as "10 hours" (where an hour is 3600 seconds)
#   "P2D"       -- parses as "2 days" (where a day is 24 hours or 86400 seconds)
#   "P2DT3H4M"  -- parses as "2 days, 3 hours and 4 minutes"
#   "PT-6H3M"   -- parses as "-6 hours and +3 minutes"
#   "-PT6H3M"   -- parses as "-6 hours and -3 minutes"
#   "-PT-6H+3M" -- parses as "+6 hours and -3 minutes"
# Node-level K8s rules; every rule groups by the 'cluster' and 'node' tags.
# expSuffix rewrites the 'cluster' tag to 'k8s-cluster::<cluster>' and calls
# instance(['cluster'] , ['node']).
# NOTE(review): presumably this suffix is appended to every rule's exp and metricPrefix is
# prepended to every rule name -- confirm against the MAL rule documentation.
expSuffix: tag({tags -> tags.cluster = 'k8s-cluster::' + tags.cluster}).instance(['cluster'] , ['node'])
metricPrefix: k8s_node
metricsRules:
# CPU: values multiplied by 1000. cpu_usage takes only samples tagged id == '/' and applies
# rate over the 'PT1M' duration.
- name: cpu_cores
exp: (kube_node_status_capacity * 1000).tagEqual('resource' , 'cpu').sum(['cluster' , 'node'])
- name: cpu_usage
exp: (container_cpu_usage_seconds_total * 1000).tagEqual('id' , '/').sum(['cluster' , 'node']).rate('PT1M')
- name: cpu_cores_allocatable
exp: (kube_node_status_allocatable * 1000).tagEqual('resource' , 'cpu').sum(['cluster' , 'node'])
- name: cpu_cores_requests
exp: (kube_pod_container_resource_requests * 1000).tagEqual('resource' , 'cpu').sum(['cluster' , 'node'])
- name: cpu_cores_limits
exp: (kube_pod_container_resource_limits * 1000).tagEqual('resource' , 'cpu').sum(['cluster' , 'node'])
# Memory: samples tagged resource == 'memory'; memory_usage takes only samples tagged id == '/'.
- name: memory_total
exp: kube_node_status_capacity.tagEqual('resource' , 'memory').sum(['cluster' , 'node'])
- name: memory_allocatable
exp: kube_node_status_allocatable.tagEqual('resource' , 'memory').sum(['cluster' , 'node'])
- name: memory_requests
exp: kube_pod_container_resource_requests.tagEqual('resource' , 'memory').sum(['cluster' , 'node'])
- name: memory_limits
exp: kube_pod_container_resource_limits.tagEqual('resource' , 'memory').sum(['cluster' , 'node'])
- name: memory_usage
exp: container_memory_working_set_bytes.tagEqual('id' , '/').sum(['cluster' , 'node'])
# Storage: samples tagged resource == 'ephemeral_storage'.
- name: storage_total
exp: kube_node_status_capacity.tagEqual('resource' , 'ephemeral_storage').sum(['cluster' , 'node'])
- name: storage_allocatable
exp: kube_node_status_allocatable.tagEqual('resource' , 'ephemeral_storage').sum(['cluster' , 'node'])
# node_status filters on value 1 and a 'status' tag matching 'true|unknown'.
- name: node_status
exp: kube_node_status_condition.valueEqual(1).tagMatch('status' , 'true|unknown').sum(['cluster' , 'node' ,'condition'])
- name: pod_total
exp: kube_pod_info.sum(['cluster' , 'node'])
# Network: byte counters summed per node, then irate() applied. Unlike cpu_usage/memory_usage,
# no id == '/' filter is applied here.
- name: network_receive
exp: container_network_receive_bytes_total.sum(['cluster' , 'node']).irate()
- name: network_transmit
exp: container_network_transmit_bytes_total.sum(['cluster' , 'node']).irate()
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Where a rule expression takes a duration argument (e.g. rate('PT1M') below), the value is
# parsed from a textual representation of a duration. The accepted formats are based on the
# ISO-8601 duration format PnDTnHnMn.nS, with days considered to be exactly 24 hours.
#
# Examples:
#   "PT20.345S" -- parses as "20.345 seconds"
#   "PT15M"     -- parses as "15 minutes" (where a minute is 60 seconds)
#   "PT10H"     -- parses as "10 hours" (where an hour is 3600 seconds)
#   "P2D"       -- parses as "2 days" (where a day is 24 hours or 86400 seconds)
#   "P2DT3H4M"  -- parses as "2 days, 3 hours and 4 minutes"
#   "PT-6H3M"   -- parses as "-6 hours and +3 minutes"
#   "-PT6H3M"   -- parses as "-6 hours and -3 minutes"
#   "-PT-6H+3M" -- parses as "+6 hours and -3 minutes"
# Service-level K8s rules.
# expSuffix rewrites the 'cluster' tag to 'k8s-cluster::<cluster>' and calls
# endpoint(['cluster'] , ['service']).
# NOTE(review): presumably this suffix is appended to every rule's exp and metricPrefix is
# prepended to every rule name -- confirm against the MAL rule documentation.
#
# Every rule below adds a 'service' tag via retagByK8sMeta (Pod2Service, using the 'pod' and
# 'namespace' tags) and then applies tagNotEqual('service' , '') so that samples left with an
# empty service value are excluded.
expSuffix: tag({tags -> tags.cluster = 'k8s-cluster::' + tags.cluster}).endpoint(['cluster'] , ['service'])
metricPrefix: k8s_service
metricsRules:
- name: pod_total
exp: kube_pod_info.retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').sum(['cluster' , 'service'])
# Requests and limits: CPU values are multiplied by 1000; memory values are used as-is.
- name: cpu_cores_requests
exp: (kube_pod_container_resource_requests * 1000).retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').tagEqual('resource' , 'cpu').sum(['cluster' , 'service'])
- name: cpu_cores_limits
exp: (kube_pod_container_resource_limits * 1000).retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').tagEqual('resource' , 'cpu').sum(['cluster' , 'service'])
- name: memory_requests
exp: kube_pod_container_resource_requests.retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').tagEqual('resource' , 'memory').sum(['cluster' , 'service'])
- name: memory_limits
exp: kube_pod_container_resource_limits.retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').tagEqual('resource' , 'memory').sum(['cluster' , 'service'])
# Pod status: the phase/waiting/terminated rules filter on value 1.
- name: pod_status
exp: kube_pod_status_phase.retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').valueEqual(1).sum(['cluster' , 'service' , 'pod' , 'phase'])
- name: pod_status_waiting
exp: kube_pod_container_status_waiting_reason.retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').valueEqual(1).sum(['cluster' , 'service' , 'pod' , 'container' , 'reason'])
- name: pod_status_terminated
exp: kube_pod_container_status_terminated_reason.retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').valueEqual(1).sum(['cluster' , 'service' , 'pod' , 'container' , 'reason'])
- name: pod_status_restarts_total
exp: kube_pod_container_status_restarts_total.retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').sum(['cluster' , 'service' , 'pod'])
# Per-pod usage from container_* metrics. Most of these rules first apply
# tagNotEqual('pod' , '') before retagging.
# NOTE(review): pod_memory_usage has no explicit tagNotEqual('pod' , '') filter, unlike its
# siblings -- confirm whether that is intentional.
- name: pod_cpu_usage
exp: (container_cpu_usage_seconds_total * 1000).tagNotEqual('pod' , '').retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').sum(['cluster' , 'service' , 'pod']).rate('PT1M')
- name: pod_memory_usage
exp: container_memory_working_set_bytes.retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').sum(['cluster' , 'service' , 'pod'])
- name: pod_network_receive
exp: container_network_receive_bytes_total.tagNotEqual('pod' , '').retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').sum(['cluster' , 'service' , 'pod']).irate()
- name: pod_network_transmit
exp: container_network_transmit_bytes_total.tagNotEqual('pod' , '').retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').sum(['cluster' , 'service' , 'pod']).irate()
- name: pod_fs_usage
exp: container_fs_usage_bytes.tagNotEqual('pod' , '').retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').sum(['cluster' , 'service' , 'pod'])
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册