coredns优化和监控
目录:
优化
pod内存限制
原来pod的内存限制
# Original CoreDNS container resources: 170Mi memory limit, 70Mi memory
# request, 500m CPU request, and no CPU limit.
resources:
  limits:
    memory: 170Mi
  requests:
    cpu: 500m
    memory: 70Mi
现在pod的内存限制
# Current CoreDNS container resources: the memory request equals the
# memory limit (4000Mi), the CPU request is one full core, and there is
# still no CPU limit.
resources:
  limits:
    memory: 4000Mi
  requests:
    cpu: "1"
    memory: 4000Mi
coredns配置调整
调整前
# Corefile before tuning: a single server block handles every zone.
.:53 {
    errors
    # Health endpoint; lameduck keeps it up for 5s during shutdown.
    health {
        lameduck 5s
    }
    ready
    # Serve the cluster domain and reverse zones from the Kubernetes API;
    # reverse lookups that are not found fall through to the next plugin.
    kubernetes cluster.local in-addr.arpa ip6.arpa {
        pods insecure
        fallthrough in-addr.arpa ip6.arpa
        ttl 30
    }
    # Metrics endpoint.
    prometheus :9153
    # Everything else goes to the resolvers listed in /etc/resolv.conf.
    forward . /etc/resolv.conf
    # Cache answers for at most 30s with default capacities.
    cache 30
    loop
    reload
    loadbalance
}
调整后
# Server block for all zones outside the cluster domain.
.:53 {
    errors
    # Health endpoint; lameduck keeps it up for 5s during shutdown.
    health {
        lameduck 5s
    }
    ready
    prometheus :9153
    forward . /etc/resolv.conf
    # Outside the cluster the upstream is nameagent and records must be
    # refreshed quickly, so answers are cached for at most 30s.
    cache 30 {
        # Enlarged cache capacities (number of entries).
        success 10240
        denial 51200
    }
    # Short-circuit every IPv6 (AAAA) lookup with an empty NOERROR (NODATA)
    # answer. NXDOMAIN is deliberately not used: it states that the name
    # does not exist for ANY record type, and resolvers or caches that
    # honour this can then fail the A lookup for the same name as well.
    template ANY AAAA {
        rcode NOERROR
    }
    loop
    reload
    loadbalance
}
# Server block for in-cluster names. The reverse zones are listed as block
# keys too; otherwise PTR queries never reach this block and the
# in-addr.arpa / ip6.arpa zones given to the kubernetes plugin are dead.
cluster.local.:53 in-addr.arpa.:53 ip6.arpa.:53 {
    errors
    # No `health` here: it is process-wide and already enabled in the
    # `.:53` server block.
    ready
    # Declared once only; a plugin may appear once per server block.
    prometheus :9153
    kubernetes cluster.local in-addr.arpa ip6.arpa {
        pods insecure
        fallthrough in-addr.arpa ip6.arpa
        ttl 120
    }
    # Used for reverse lookups that fall through the kubernetes plugin.
    forward . /etc/resolv.conf
    # In-cluster records are mostly clusterIPs that rarely change, so they
    # can be cached for up to 120s.
    cache 120 {
        # Enlarged cache capacities (number of entries).
        success 10240
        denial 51200
    }
    # Short-circuit every IPv6 (AAAA) lookup with an empty NOERROR (NODATA)
    # answer. NXDOMAIN is deliberately not used: it states that the name
    # does not exist for ANY record type, and resolvers or caches that
    # honour this can then fail the A lookup for the same name as well.
    template ANY AAAA {
        rcode NOERROR
    }
    loop
    reload
    loadbalance
}
resolv.conf文件
调整前
nameserver 10.112.0.10
search why.svc.cluster.local svc.cluster.local cluster.local whysdomain.com
options ndots:5
调整后
# Pod resolv.conf after tuning.
# All queries go to this resolver address.
nameserver 10.112.0.10
# Only one search suffix is left, so a short name is expanded at most once.
search cluster.local
# ndots:5             - names with fewer than 5 dots try the search list first.
# single-request-reopen - glibc option: reopen the socket between the A and
#                         AAAA requests instead of reusing one socket.
# timeout:2           - wait 2 seconds for a reply before retrying.
options ndots:5 single-request-reopen timeout:2
现在集群内要求统一使用<servicename>.<namespace>.svc
示例集群内
blog.why.svc调整前解析流程
- blog.why.svc.why.svc.cluster.local
- blog.why.svc.svc.cluster.local
- blog.why.svc.cluster.local
调整后解析流程
- blog.why.svc.cluster.local
示例集群外,但是是内网环境
test.test.svc调整前解析流程
- test.test.svc.why.svc.cluster.local
- test.test.svc.svc.cluster.local
- test.test.svc.cluster.local
- test.test.svc.whysdomain.com
test.test.svc调整后解析流程
- test.test.svc.cluster.local
- test.test.svc(使用dns解析自动补全whysdomain.com)
示例外网域名
www.baidu.com调整前解析流程
- www.baidu.com.why.svc.cluster.local
- www.baidu.com.svc.cluster.local
- www.baidu.com.cluster.local
- www.baidu.com.whysdomain.com
- www.baidu.com
www.baidu.com调整后解析流程
- www.baidu.com.cluster.local
- www.baidu.com
监控
# Prometheus recording and alerting rules for CoreDNS.
# All rate() windows are 75s and results are aggregated per
# (instance, cluster).
groups:
  - name: coredns-alert-rules
    rules:
      # Per-second DNS request rate.
      # NOTE(review): CoreDNS 1.7.0 renamed this counter to
      # coredns_dns_requests_total - confirm against the deployed version.
      - record: Coredns_Request_Num
        expr: sum by (instance,cluster) (rate(coredns_dns_request_count_total{k8s_app="kube-dns"}[75s]))
        labels:
          common: "Coredns请求数"
      # Percentage of requests that fall in the le="0.064" (64ms) bucket.
      - record: Coredns_Request_Lt64ms_Percent
        expr: sum by (instance,cluster) (rate(coredns_dns_request_duration_seconds_bucket{le="0.064"}[75s])) / sum by (instance,cluster) (rate(coredns_dns_request_duration_seconds_bucket{le="+Inf"}[75s]))*100
        labels:
          common: "Coredns请求耗时小于64ms的百分比"
      # Per-second rate of requests slower than 64ms
      # (all requests minus those in the 64ms bucket).
      - record: Coredns_Request_Gt64ms_Num
        expr: sum by (instance,cluster) (rate(coredns_dns_request_duration_seconds_bucket{le="+Inf"}[75s])) - sum by (instance,cluster) (rate(coredns_dns_request_duration_seconds_bucket{le="0.064"}[75s]))
        labels:
          common: "Coredns请求耗时大于64ms的数量"
      # Percentage of forwarded (upstream) requests answered within 64ms.
      - record: Coredns_Forward_Request_Lt64ms_Percent
        expr: sum by (instance,cluster) (rate(coredns_forward_request_duration_seconds_bucket{le="0.064"}[75s])) / sum by (instance,cluster) (rate(coredns_forward_request_duration_seconds_bucket{le="+Inf"}[75s]))*100
        labels:
          common: "Coredns请求转发耗时小于64ms的百分比"
      # Per-second rate of forwarded requests slower than 64ms.
      - record: Coredns_Forward_Request_Gt64ms_Num
        expr: sum by (instance,cluster) (rate(coredns_forward_request_duration_seconds_bucket{le="+Inf"}[75s])) - sum by (instance,cluster) (rate(coredns_forward_request_duration_seconds_bucket{le="0.064"}[75s]))
        labels:
          common: "Coredns请求转发耗时大于64ms的数量"
      # Number of open file descriptors of the CoreDNS process.
      - record: Coredns_Open_Fds
        expr: sum by (instance,cluster) (process_open_fds{k8s_app="kube-dns"})
        labels:
          common: "Coredns文件打开数"
      # Fire when the recorded request rate stays above 10000 for 2 minutes.
      - alert: Coredns_Request_Num
        expr: Coredns_Request_Num > 10000
        for: 2m
        labels:
          extend: '{"server": "容器基础监控", "for": "2m"}'
          receiver: op-k8s
          level: P1
        annotations:
          description: "{{ $labels.instance }} 集群Coredns解析请求数大于10000; 当前值为: {{ $value }}"