logstash filter通过ruby进行正则匹配

时间:Feb. 26, 2019 分类:

目录:

环境介绍

日志格式

    # Access-log format named "main": one line per request, fields in the order listed below.
    # NOTE(review): each "^A" is presumably the caret rendering of the single 0x01 (SOH)
    # control character used as the field delimiter, not a literal caret followed by "A" - confirm.
    log_format  main  '$remote_addr^A-^A$remote_user^A[$time_local]^A"$request"^A$status^A'
                      '$request_time^A$upstream_response_time^A$body_bytes_sent^A"$http_referer"^A'
                      '"$http_user_agent"^A"$http_x_forwarded_for"^A"$request_body"';

单条日志(字段之间的 ^A 分隔符是不可见的控制字符 0x01,下面的示例中没有显示出来,所以相邻字段看起来连在了一起)

10.30.8.243--[15/Nov/2017:10:50:37 +0800]"POST /Click/?e=ZTAxMWMxZjdAfEA3YzYwZmFjNjE1ZTllNTZiNjZlYTNiYmNkMmFhY2YxM0B8QDNiOGRjZmY5YzYxYzQ3MDNmZjUxN2QwMTMxYzlhNGI3QHxANDMyMjczMzdAfEAxMjEuMTQuMTQuOTBAfEAxNTEwNzEzODkxQHxAWEJAfEBicmFuZC1saXN0LWFsbC1uZXctZmVtYWxlLWZlbWFsZUB8QGJhbm5lcl9ob21lLXBhZ2VfZXZlbnRTY3JvbGxlZExpc3RAfEAyQHxAMjUzMTQyNEB8QFFEX2NwYTEyNEB8QFFEX2NwYTI1QHxAMEB8QFJfNWEwYmFhMjMwOTRkZEB8QHRhZ18w HTTP/1.1"2000.0050.00548"-""Dalvik/2.1.0 (Linux; U; Android 5.1; F100 Build/LMY47D) okhttp/3.4.1""121.14.14.90""misc=%7B%22source%22%3A%22%22%7D&timestamp=1510714238&client=%7B%22ageGroup%22%3A%22ALL%22%2C%22channel%22%3A%22QD_cpa25%22%2C%22deviceBrand%22%3A%22GIONEE+F100%22%2C%22deviceId%22%3A%227c60fac615e9e56b66ea3bbcd2aacf13%22%2C%22gender%22%3A%220%22%2C%22imei%22%3A%22862972030212309%22%2C%22net%22%3A%22wifi%22%2C%22packageName%22%3A%22com.culiukeji.huanletao%22%2C%22platform%22%3A%22android%22%2C%22screen%22%3A%221280*720%22%2C%22sdkVersion%22%3A%225.1%22%2C%22sessionId%22%3A%223b8dcff9c61c4703ff517d0131c9a4b7%22%2C%22shopToken%22%3A%22ccjb31535426282279000043227337af1535426282%22%2C%22userId%22%3A%2243227337%22%2C%22version%22%3A%223.13.110%22%2C%22xingeToken%22%3A%22725e882140347bce4f99b7654c939078acf8c8a9%22%7D&sign=5DC3DE0BEEEB667A0D161A27A079DA31"

shipper.conf

# Shipper input: tail the nginx access log on the web host.
input {
    file {
        # Log file to follow.
        path => [ "/home/ec2-user/logs/nginx/api.chuchujie.com.access.log" ]
        # File in which the file input records how far it has read.
        sincedb_path => "/tmp/.ads_ads_ng-access-log.pos"
        # Begin at the end of the file, i.e. ship only newly written lines.
        start_position => "end"
        # Tag events so the output section can route them by type.
        type => "ads_ads_ng-access-log"
    }

}

# Shipper output: publish each access-log line, prefixed with the shipping host, to Kafka.
output{
    if [type] == "ad​s_ads_ng-access-log" {
        kafka {
            # Send plain text rather than JSON: the host name followed by the raw log line.
            # NOTE(review): the indexer splits every message into 14 delimiter-separated
            # fields whose first name is 'host', so a 0x01 (^A) delimiter presumably sits
            # between %{host} and %{message} and is simply not visible here - confirm.
            codec => plain {
                format => "%{host}%{message}"
            }
            bootstrap_servers => "internal-elk-kafka-1432427282.cn-north-1.elb.amazonaws.com.cn:80"
            # Must match the topic_id consumed by the indexer's kafka input.
            topic_id => "ads_ads_ng-access-log"
            compression_type => "snappy" # string (optional), one of ["none", "gzip", "snappy"], default: "none"
        }
    }
}

indexer.conf

# Indexer input: consume the access-log topic that the shipper publishes to Kafka.
input {
    kafka {
        consumer_restart_on_error => true
        # Two consumer threads within consumer group "elk".
        consumer_threads => 2
        group_id => "elk"
        # Same topic name as the shipper's kafka output.
        topic_id => "ads_ads_ng-access-log"
        # Re-apply the type tag; the filter and output sections below are conditional on it.
        type => "ads_ads_ng-access-log"
        # ZooKeeper endpoint used by this (older, ZooKeeper-based) kafka input.
        zk_connect => "internal-elk-kafka-1432427282.cn-north-1.elb.amazonaws.com.cn:2181"
    }
}
# Indexer filter: turn the delimiter-separated access-log line into named fields,
# then derive verb/request/httpversion, request_path, module and function.
filter {
    if [type] == "ads_ads_ng-access-log" {
        # Step 1: split the raw message into the 14 fields named in @kname.
        ruby {
            init => "@kname = ['host','elb','ident','auth','timestamp','requestmethod','response','request_time','upstream_time','bytes','referrer','agent','xforwardedfor','request_body']"
            # The fields are separated by the 0x01 (^A) control character used in the
            # nginx log_format. 1.chr builds that one-character delimiter without
            # embedding an invisible byte in this file; splitting on an empty string
            # would instead break the message into single characters.
            # @timestamp is dropped from the temporary event so that appending it does
            # not add a second timestamp to the real event.
            code => "
                new_event = LogStash::Event.new(Hash[@kname.zip(event['message'].split(1.chr))])
                new_event.remove('@timestamp')
                event.append(new_event)"
            remove_field => [ "message" ]
            # Defaults, kept when request_body carries no module/function value.
            add_field => { "module" => "-" }
            add_field => { "function" => "-" }

        }
        # Step 2: "<verb> <request> <httpversion>" -> three fields, split on spaces.
        if [requestmethod] {
            ruby{
                init=> "@kname = ['verb','request','httpversion']"
                code=> "event.append(Hash[@kname.zip(event['requestmethod'].split(' '))])"
            }
        }
        # Step 3: keep only the part of the request before '?'; zip() has a single
        # target name, so the query string is discarded.
        if [request] {
            ruby{
                init=> "@kname = ['request_path']"
                code=> "event.append(Hash[@kname.zip(event['request'].split('?'))])"
            }
        }
        # Step 4: extract module/function from the URL-encoded JSON in request_body.
        # Assign only when the pattern matches, so a miss leaves the '-' default
        # instead of overwriting it with nil.
        ruby {
            code => "
                body = event['request_body']
                if body.class == String
                    found = body[/(module%22%3A%22)([0-9a-zA-Z_]+)/,2]
                    event['module'] = found if found
                end"
        }
        ruby {
            code => "
                body = event['request_body']
                if body.class == String
                    found = body[/(function%22%3A%22)([0-9a-zA-Z_]+)/,2]
                    event['function'] = found if found
                end"
        }
        # nginx logs '-' when there is no upstream response time; store 0.0 instead.
        ruby {
            code => "event['upstream_time'] = 0.0 if event['upstream_time'] == '-';"
        }
        # Blank out agent and path, and strip the double quotes that surrounded
        # the original "$request" value from verb and httpversion.
        mutate {
            replace => ["agent", ""]
            replace => ["path", ""]
            gsub => ["verb",'"','']
            gsub => ["httpversion",'"','']
        }
        # requestmethod has been fully decomposed above.
        mutate {
            remove_field => ["requestmethod"]
        }

    }
}

# Indexer output: write the parsed access-log events to Elasticsearch.
output {
    if [type] == "ads_ads_ng-access-log" {
        elasticsearch {
            action => "index"
            # Bulk-buffer settings: batch size and the idle time before a partial batch is flushed.
            flush_size => 100
            hosts => ["internal-es-ads-507594234.cn-north-1.elb.amazonaws.com.cn"]
            idle_flush_time => 1
            # Index name carries a date suffix formatted as YYYY.MM.dd.
            index => "ads.access.log-%{+YYYY.MM.dd}"
            # Let Logstash install the mapping template named below ...
            manage_template => true
            retry_max_interval => 2
            timeout => 2
            workers => 20
            # ... loading it from this file, under this name, replacing any existing copy.
            template => "/home/ec2-user/op/op-logstash/conf/.template/ads-access.json"
            template_name => "ads-access"
            template_overwrite => true
        }
    }
}

字段详解

'verb','request','httpversion'

requestmethod

对应

POST /Click/?e=ZTAxMWMxZjdAfEA3YzYwZmFjNjE1ZTllNTZiNjZlYTNiYmNkMmFhY2YxM0B8QDNiOGRjZmY5YzYxYzQ3MDNmZjUxN2QwMTMxYzlhNGI3QHxANDMyMjczMzdAfEAxMjEuMTQuMTQuOTBAfEAxNTEwNzEzODkxQHxAWEJAfEBicmFuZC1saXN0LWFsbC1uZXctZmVtYWxlLWZlbWFsZUB8QGJhbm5lcl9ob21lLXBhZ2VfZXZlbnRTY3JvbGxlZExpc3RAfEAyQHxAMjUzMTQyNEB8QFFEX2NwYTEyNEB8QFFEX2NwYTI1QHxAMEB8QFJfNWEwYmFhMjMwOTRkZEB8QHRhZ18w HTTP/1.1

对应filter代码

# Split requestmethod on spaces and pair the pieces, in order, with verb / request / httpversion.
if [requestmethod] {
            ruby{
                init=> "@kname = ['verb','request','httpversion']"
                code=> "event.append(Hash[@kname.zip(event['requestmethod'].split(' '))])"
            }
        }

在 indexer 端按空格把 requestmethod 切分成三部分,依次对应 verb、request、httpversion

request_path

request是刚切分出来的

对应

/Click/?e=ZTAxMWMxZjdAfEA3YzYwZmFjNjE1ZTllNTZiNjZlYTNiYmNkMmFhY2YxM0B8QDNiOGRjZmY5YzYxYzQ3MDNmZjUxN2QwMTMxYzlhNGI3QHxANDMyMjczMzdAfEAxMjEuMTQuMTQuOTBAfEAxNTEwNzEzODkxQHxAWEJAfEBicmFuZC1saXN0LWFsbC1uZXctZmVtYWxlLWZlbWFsZUB8QGJhbm5lcl9ob21lLXBhZ2VfZXZlbnRTY3JvbGxlZExpc3RAfEAyQHxAMjUzMTQyNEB8QFFEX2NwYTEyNEB8QFFEX2NwYTI1QHxAMEB8QFJfNWEwYmFhMjMwOTRkZEB8QHRhZ18w

对应filter代码

# Split request on '?'; only one target name is given, so only the first piece (the path) is kept.
if [request] {
            ruby{
                init=> "@kname = ['request_path']"
                code=> "event.append(Hash[@kname.zip(event['request'].split('?'))])"
            }
        }

request_path为/Click/,后边的数据由于没有接收字段,就被舍弃了

后续request_body切分

misc=%7B%22source%22%3A%22%22%7D&timestamp=1510714238&client=%7B%22ageGroup%22%3A%22ALL%22%2C%22channel%22%3A%22QD_cpa25%22%2C%22deviceBrand%22%3A%22GIONEE+F100%22%2C%22deviceId%22%3A%227c60fac615e9e56b66ea3bbcd2aacf13%22%2C%22gender%22%3A%220%22%2C%22imei%22%3A%22862972030212309%22%2C%22net%22%3A%22wifi%22%2C%22packageName%22%3A%22com.culiukeji.huanletao%22%2C%22platform%22%3A%22android%22%2C%22screen%22%3A%221280*720%22%2C%22sdkVersion%22%3A%225.1%22%2C%22sessionId%22%3A%223b8dcff9c61c4703ff517d0131c9a4b7%22%2C%22shopToken%22%3A%22ccjb31535426282279000043227337af1535426282%22%2C%22userId%22%3A%2243227337%22%2C%22version%22%3A%223.13.110%22%2C%22xingeToken%22%3A%22725e882140347bce4f99b7654c939078acf8c8a9%22%7D&sign=5DC3DE0BEEEB667A0D161A27A079DA31

后续添加了

        # Six extra extractions from the URL-encoded JSON in request_body, one ruby
        # filter per field. Each reads the value that follows <key>%22%3A%22
        # (i.e. "<key>":") and runs only when request_body is a String.
        ruby {
            code => "
                body = event['request_body']
                event['packageName'] = body[/(packageName%22%3A%22)([0-9a-zA-Z.]+)/,2] if body.class == String"
        }
        ruby {
            code => "
                body = event['request_body']
                event['platform'] = body[/(platform%22%3A%22)([0-9a-zA-Z_]+)/,2] if body.class == String"
        }
        ruby {
            code => "
                body = event['request_body']
                event['version'] = body[/(version%22%3A%22)([0-9a-zA-Z.]+)/,2] if body.class == String"
        }
        ruby {
            code => "
                body = event['request_body']
                event['channel'] = body[/(channel%22%3A%22)([0-9a-zA-Z_]+)/,2] if body.class == String"
        }
        ruby {
            code => "
                body = event['request_body']
                event['gender'] = body[/(gender%22%3A%22)([0-9a-zA-Z]+)/,2] if body.class == String"
        }
        ruby {
            code => "
                body = event['request_body']
                event['net'] = body[/(net%22%3A%22)([0-9a-zA-Z]+)/,2] if body.class == String"
        }

6个字段的正则匹配

取每天上午9点的请求量和 logstash indexer 机器的 cpu 空闲率作为对比数据

每天上午9点的请求

每天上午9点的logstash indexer机器的cpu空闲率

总结

机型为aws的c3.large,即3.75 GiB 内存,2 个 vCPU,32 GB SSD 本地实例存储,64 位平台

昨天新增了6个字段的正则提取,在上午9点请求量最大的时候,cpu大约多用了15%:前两天空闲率是65%,今天是50%。

  • 上午9点约4.3万请求,cpu使用率由37%升到52%(多15%),算下来6个正则每1万请求多消耗约3.49%的cpu(15 ÷ 4.3);
  • 晚上9点半约3.4万请求,cpu使用率由31%升到40%(多9%),算下来6个正则每1万请求多消耗约2.65%的cpu(9 ÷ 3.4)。