
时间:Feb. 13, 2017 分类:




# Host definition for the monitored server why-203.
# Inline comments use ';' because Nagios object files only treat '#' as a
# comment at the start of a line; a trailing '#...' would become part of the value.
define host{
        use                     linux-server            ; host template to inherit from, see templates.cfg
        host_name               why-203                 ; host name; checks look up the IP through this definition
        alias                   why-203-alias           ; host alias
        address                 <monitored-server-ip>   ; IP of the monitored server (value missing in the original listing; fill in the real address)
        check_command           check-host-alive        ; host liveness check, defined in commands.cfg
        max_check_attempts      10                      ; maximum number of attempts after a failure
        check_interval          5                       ; regular check interval, in minutes by default
        retry_interval          1                       ; retry interval after a failure, in minutes by default
        check_period            24x7                    ; check period 24x7, see timeperiods.cfg
        notification_interval   120                     ; interval between two notifications while the problem persists, in minutes by default
        notification_period     workhours               ; notification period workhours, see timeperiods.cfg
        notification_options    d,u,r                   ; host states to notify on: d=down, u=unreachable, r=recovery
        contact_groups          admins                  ; contact group to notify, defined in contacts.cfg
        }


# Concrete service: memory check on why-203, executed through NRPE.
define service{
        use                             generic-service     ; inherit defaults from the template defined right below
        host_name                       why-203
        service_description             mem
        check_command                   check_nrpe!check_mem
        }

# Service template referenced by 'use generic-service'.
# (The original listing had this template fused into the service above.)
define service{
        name                            generic-service     ; template name
        active_checks_enabled           1
        passive_checks_enabled          1
        parallelize_check               1
        obsess_over_service             1
        check_freshness                 0
        notifications_enabled           1
        event_handler_enabled           1
        flap_detection_enabled          1
        failure_prediction_enabled      1
        process_perf_data               1                   ; record performance data (used for PNP graphs)
        retain_status_information       1
        retain_nonstatus_information    1
        is_volatile                     0
        check_period                    24x7                ; check period
        max_check_attempts              3                   ; maximum number of attempts after a failure
        normal_check_interval           10                  ; regular check interval, in minutes (the author recommends 2)
        retry_check_interval            2                   ; retry interval, in minutes by default (the author recommends 1)
        contact_groups                  admins              ; contact group to notify
        notification_options            w,u,c,r             ; service states to notify on: w=warning, u=unknown, c=critical, r=recovery
        notification_interval           60                  ; re-notify every 60 minutes (the author recommends 30)
        notification_period             24x7                ; notification period
        register                        0                   ; template only, not registered as a real service
        }



# Service template 'why-nagios'; 'register 0' keeps it from being treated as a real service.
define service{
        name                            why-nagios          ; template name
        active_checks_enabled           1
        passive_checks_enabled          1
        parallelize_check               1
        obsess_over_service             1
        check_freshness                 0
        notifications_enabled           1
        event_handler_enabled           1
        flap_detection_enabled          1
        failure_prediction_enabled      1
        process_perf_data               1
        retain_status_information       1
        retain_nonstatus_information    1
        is_volatile                     0
        check_period                    24x7
        max_check_attempts              2                   ; attempts before the state is considered final
        normal_check_interval           2                   ; regular check interval
        retry_check_interval            1                   ; retry interval after a failure
        contact_groups                  admins
        notification_options            w,u,c,r
        notification_interval           10                  ; interval between repeated notifications
        notification_period             24x7
        register                        0
        }



[root@why-3 ~]# cat /usr/local/nagios/etc/objects/timeperiods.cfg 

# This defines a timeperiod where all times are valid for checks, 
# notifications, etc.  The classic "24x7" support nightmare. :-)
define timeperiod{
        timeperiod_name 24x7
        alias           24 Hours A Day, 7 Days A Week
        ; every weekday covers the full day
        sunday          00:00-24:00
        monday          00:00-24:00
        tuesday         00:00-24:00
        wednesday       00:00-24:00
        thursday        00:00-24:00
        friday          00:00-24:00
        saturday        00:00-24:00
        }

# 'workhours' timeperiod definition
define timeperiod{
    timeperiod_name workhours
    alias       Normal Work Hours
    ; Monday to Friday, 09:00-17:00; no entries for Saturday and Sunday
    monday      09:00-17:00
    tuesday     09:00-17:00
    wednesday   09:00-17:00
    thursday    09:00-17:00
    friday      09:00-17:00
    }

# 'none' timeperiod definition
define timeperiod{
    timeperiod_name none
    alias       No Time Is A Good Time
    ; deliberately lists no day/time ranges
    }

# Some U.S. holidays
# Note: The timeranges for each holiday are meant to *exclude* the holidays from being
# treated as a valid time for notifications, etc.  You probably don't want your pager 
# going off on New Year's.  Although your employer might... :-)
define timeperiod{
        name                    us-holidays     ; template name, referenced via 'use' by other timeperiods
        timeperiod_name         us-holidays
        alias                   U.S. Holidays

        january 1               00:00-00:00     ; New Years
        monday -1 may           00:00-00:00     ; Memorial Day (last Monday in May)
        july 4                  00:00-00:00     ; Independence Day
        monday 1 september      00:00-00:00     ; Labor Day (first Monday in September)
        thursday 4 november     00:00-00:00     ; Thanksgiving (4th Thursday in November)
        december 25             00:00-00:00     ; Christmas
        }

# This defines a modified "24x7" timeperiod that covers every day of the
# year, except for U.S. holidays (defined in the timeperiod above).
define timeperiod{
        timeperiod_name 24x7_sans_holidays
        alias           24x7 Sans Holidays

        use             us-holidays     ; Get holiday exceptions from other timeperiod

        sunday          00:00-24:00
        monday          00:00-24:00
        tuesday         00:00-24:00
        wednesday       00:00-24:00
        thursday        00:00-24:00
        friday          00:00-24:00
        saturday        00:00-24:00
        }



[root@why-3 ~]# cat /usr/local/nagios/etc/objects/contacts.cfg    

# Just one contact defined by default - the Nagios admin (that's you)
# This contact definition inherits a lot of default values from the 'generic-contact' 
# template which is defined elsewhere.

define contact{
        contact_name                    nagiosadmin         ; Short name of user
        use                             generic-contact     ; Inherit default values from the generic-contact template
        alias                           Nagios Admin        ; Full name of user

        email                           nagios@localhost    ; <<***** CHANGE THIS TO YOUR EMAIL ADDRESS ******
        }


# We only have one contact in this simple configuration file, so there is
# no need to create more than one contact group.

define contactgroup{
        contactgroup_name       admins                  ; group name referenced by 'contact_groups'
        alias                   Nagios Administrators
        members                 nagiosadmin             ; contacts belonging to this group
        }


# Contact template inherited by real contacts through 'use generic-contact'.
define contact{
        name                            generic-contact     ; The name of this contact template
        service_notification_period     24x7            ; service notifications can be sent anytime
        host_notification_period        24x7            ; host notifications can be sent anytime
        service_notification_options    w,u,c,r,f,s     ; send notifications for all service states, flapping events, and scheduled downtime events
        host_notification_options       d,u,r,f,s       ; send notifications for all host states, flapping events, and scheduled downtime events
        service_notification_commands   notify-service-by-email ; send service notifications via email
        host_notification_commands      notify-host-by-email    ; send host notifications via email
        register                        0               ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL CONTACT, JUST A TEMPLATE!
        }



# 'notify-host-by-email' command definition
define command{
    ; formats the host alert with printf and pipes it to /bin/mail, addressed to $CONTACTEMAIL$
    command_name    notify-host-by-email
    command_line    /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\nHost: $HOSTNAME$\nState: $HOSTSTATE$\nAddress: $HOSTADDRESS$\nInfo: $HOSTOUTPUT$\n\nDate/Time: $LONGDATETIME$\n" | /bin/mail -s "** $NOTIFICATIONTYPE$ Host Alert: $HOSTNAME$ is $HOSTSTATE$ **" $CONTACTEMAIL$
    }

# 'notify-service-by-email' command definition
define command{
    ; formats the service alert with printf and pipes it to /bin/mail, addressed to $CONTACTEMAIL$
    command_name    notify-service-by-email
    command_line    /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\n\nService: $SERVICEDESC$\nHost: $HOSTALIAS$\nAddress: $HOSTADDRESS$\nState: $SERVICESTATE$\n\nDate/Time: $LONGDATETIME$\n\nAdditional Info:\n\n$SERVICEOUTPUT$\n" | /bin/mail -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$
    }





每当Nagios查询一个服务的状态时,都会产生一个子进程,并使用该命令的输出和退出代码来确定具体的状态。Nagios可以识别的状态码如下:

代表 | 状态码 | 状态码表示
---|---|---
OK | 0 | 表示服务正常地工作
WARNING | 1 | 表示服务处于警告状态
CRITICAL | 2 | 表示服务处于紧急、严重状态
UNKNOWN | 3 | 表示服务处于未知状态
DEPENDENT | 4 | 不知道,不常用


[root@why-3 ~]# md5sum /etc/passwd > /etc/ps.md5
[root@why-3 ~]# md5sum -c /etc/ps.md5 
/etc/passwd: OK
[root@why-3 ~]# cd /usr/local/nagios/libexec/
[root@why-3 libexec]# vi check_passwd
[root@why-3 libexec]# cat check_passwd 
#!/bin/sh
# Nagios plugin: verify /etc/passwd against the checksum saved in /etc/ps.md5.
# Prints a one-line status and exits 0 (OK) when the checksum matches,
# or 2 (CRITICAL) when it does not.

# Number of "OK" lines reported by md5sum; 1 means the file is unchanged.
char=`md5sum -c /etc/ps.md5 2>/dev/null | grep OK | wc -l`
if [ "$char" -eq 1 ];then
    echo "passwd is ok"
    exit 0
else
    # The original listing lacked 'echo' here, so the shell would have tried
    # to execute the message as a command.
    echo "passwd is changed"
    exit 2
fi
[root@why-3 libexec]# chmod +x check_passwd 
[root@why-3 libexec]# ./check_passwd 
passwd is ok
[root@why-3 libexec]# vi ../etc/nrpe.cfg
command[check_passwd]=/usr/local/nagios/libexec/check_passwd -w 6 -c 10
[root@why-3 libexec]# pkill nrpe 
[root@why-3 libexec]# /usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d
[root@why-3 libexec]# ./check_nrpe -H <被监控主机IP> -c check_passwd 
passwd is ok
[root@why-3 libexec]# vi  ../etc/objects/service.cfg
define service {
        use                     generic-service             ; inherit defaults from the generic-service template
        host_name               why-203
        service_description     passwd
        check_command           check_nrpe!check_passwd     ; run the check_passwd command through NRPE
        }
[root@why-3 libexec]# /etc/init.d/nagios reload
Running configuration check...done.
Reloading nagios configuration...done


[root@why-3 ~]# htpasswd /usr/local/nagios/etc/htpasswd.users mabiao
New password: 
Re-type new password: 
Adding password for user mabiao
[root@why-3 ~]# vi /usr/local/nagios/etc/cgi.cfg

[root@why-3 ~]# vi /usr/local/nagios/etc/objects/contacts.cfg
define contact{
        contact_name                    mabiao                  ; Short name of user
        use                             generic-contact         ; Inherit default values from the generic-contact template
        alias                           Nagios readonly         ; Full name of user

        email                           nagios@localhost        ; <<***** CHANGE THIS TO YOUR EMAIL ADDRESS ******
        }
[root@why-3 ~]# vi /usr/local/nagios/etc/objects/hosts.cfg
define host{
        use                     linux-server            ; Name of host template to use
                                                        ; This host definition will inherit all variables that are defined
                                                        ; in (or inherited by) the linux-server host template definition.
        host_name               why-202
        alias                   why-202-alias
        contacts                mabiao                  ; contact associated with this host
        ; NOTE(review): no 'address' directive appears in this listing - confirm it is set in the real file
        }
[root@why-3 ~]# service nagios restart
Running configuration check...done.
Stopping nagios: .done.
Starting nagios: done.




[root@why-3 ~]# tar xf libart_lgpl-2.3.17.tar.gz 
[root@why-3 ~]# cd libart_lgpl-2.3.17
[root@why-3 libart_lgpl-2.3.17]# ./configure 
[root@why-3 libart_lgpl-2.3.17]# make
[root@why-3 libart_lgpl-2.3.17]# make install 
[root@why-3 libart_lgpl-2.3.17]# cp -r /usr/local/include/libart-2.0 /usr/include


[root@why-3 libart_lgpl-2.3.17]# cd ..
[root@why-3 ~]# tar xf rrdtool-1.2.14.tar.gz 
[root@why-3 ~]# cd rrdtool-1.2.14
[root@why-3 rrdtool-1.2.14]# ./configure --prefix=/usr/local/rrdtool --disable-python --disable-tcl

Config is DONE!

          With MMAP IO: yes
          Perl Modules: perl_piped perl_shared
           Perl Binary: /usr/bin/perl
          Perl Version: 5.10.1
          Perl Options: PREFIX=/usr/local/rrdtool LIB=/usr/local/rrdtool/lib/perl/5.10.1
    Build Tcl Bindings: no
 Build Python Bindings: no
          Build rrdcgi: yes
       Build librrd MT: yes

Type 'make' to compile the software and use 'make install' to 
install everything to: /usr/local/rrdtool.

       ... that wishlist is NO JOKE. If you find RRDtool useful
make me happy. Go to http://people.ee.ethz.ch/oetiker/wish and
place an order.

                               -- Tobi Oetiker <tobi@oetiker.ch>
[root@why-3 libart_lgpl-2.3.17]# cd ..
[root@why-3 ~]# tar xf rrdtool-1.2.14.tar.gz 
[root@why-3 ~]# cd rrdtool-1.2.14
[root@why-3 rrdtool-1.2.14]# ./configure --prefix=/usr/local/rrdtool --disable-python --disable-tcl
[root@why-3 rrdtool-1.2.14]# make
[root@why-3 rrdtool-1.2.14]# make install
[root@why-3 rrdtool-1.2.14]# ls -l /usr/local/rrdtool/bin/
total 116
-rwxr-xr-x 1 root root 55641 Feb  8 22:46 rrdcgi
-rwxr-xr-x 1 root root  6727 Feb  8 22:46 rrdtool
-rwxr-xr-x 1 root root 52635 Feb  8 22:46 rrdupdate


[root@why-3 rrdtool-1.2.14]# cd ..
[root@why-3 ~]# tar xf pnp-0.4.14.tar.gz 
[root@why-3 ~]# cd pnp-0.4.14
[root@why-3 pnp-0.4.14]# ./configure \
> --with-rrdtool=/usr/local/rrdtool/bin/rrdtool \
> --with-perfdata-dir=/usr/local/nagios/share/perfdata/
如果configure时出现“Perl Module Time::HiRes not available”的提示,先执行下面的命令安装该模块,然后重新执行configure:
yum install -y perl-Time-HiRes
[root@why-3 pnp-0.4.14]# make all
[root@why-3 pnp-0.4.14]# make install
[root@why-3 pnp-0.4.14]# make install-config
[root@why-3 pnp-0.4.14]# make install-init
[root@why-3 pnp-0.4.14]# ll /usr/local/nagios/libexec/ | grep process
-rwxr-xr-x 1 nagios nagios  31827 Feb  8 22:58 process_perfdata.pl
[root@why-3 pnp-0.4.14]# vi /usr/local/nagios/etc/nagios.cfg

# 'process-host-perfdata' command definition
define command{
        command_name    process-host-perfdata
        command_line    /usr/bin/printf "%b" "$LASTHOSTCHECK$\t$HOSTNAME$\t$HOSTSTATE$\t$HOSTATTEMPT$\t$HOSTSTATETYPE$\t$HOSTEXECUTIONTIME$\t$HOSTOUTPUT$\t$HOSTPERFDATA$\n" >> /usr/local/nagios/var/host-perfdata.out
        }

# 'process-service-perfdata' command definition
# NOTE(review): the command_line of this definition is missing from the original listing.
define command{
        command_name    process-service-perfdata
        }

# The two definitions below are presumably the edited versions of the ones
# above: both hand the performance data to PNP's process_perfdata.pl.
# NOTE(review): PNP's documentation passes '-d HOSTPERFDATA' for the host
# command - confirm whether it is needed here.

# 'process-host-perfdata' command definition
define command{
        command_name    process-host-perfdata
        command_line    /usr/local/nagios/libexec/process_perfdata.pl
        }

# 'process-service-perfdata' command definition
define command{
        command_name    process-service-perfdata
        command_line    /usr/local/nagios/libexec/process_perfdata.pl
        }
[root@why-3 pnp-0.4.14]# service nagios checkconfig
[root@why-3 pnp-0.4.14]# service nagios reload

访问 http://<Nagios服务器IP>/nagios/pnp/index.php ,如果能正常打开PNP页面即配置成功

监控图像是通过在service.cfg或者templates.cfg中的service定义里将process_perf_data设为1实现的。要把URL整合到nagios:主机图像通过在hosts.cfg或者templates.cfg中的host定义里设置action_url为 /nagios/pnp/index.php?host=$HOSTNAME$ 实现;服务图像通过在service.cfg或者templates.cfg中的service定义里设置action_url为 /nagios/pnp/index.php?host=$HOSTNAME$&srv=$SERVICEDESC$ 实现。之后就可以在页面上看到趋势图。


[root@why-3 pnp-0.4.14]# ll /usr/local/nagios/share/perfdata/
total 8
drwxr-xr-x 2 nagios nagios 4096 Feb  8 23:53 why-202
drwxr-xr-x 2 nagios nagios 4096 Feb  8 23:49 why-203
[root@why-3 pnp-0.4.14]# ll /usr/local/nagios/share/perfdata/why-202/
total 4960
-rw-r--r-- 1 nagios nagios  384952 Feb  8 23:45 Disk_Partition.rrd
-rw-r--r-- 1 nagios nagios   11323 Feb  8 23:45 Disk_Partition.xml
-rw-r--r-- 1 nagios nagios 1918040 Feb  8 23:45 Disk_iostat.rrd
-rw-r--r-- 1 nagios nagios   13008 Feb  8 23:45 Disk_iostat.xml
-rw-r--r-- 1 nagios nagios  768224 Feb  8 23:53 blog_url.rrd
-rw-r--r-- 1 nagios nagios   11748 Feb  8 23:53 blog_url.xml
-rw-r--r-- 1 nagios nagios 1151496 Feb  8 23:45 load.rrd
-rw-r--r-- 1 nagios nagios   12183 Feb  8 23:45 load.xml
-rw-r--r-- 1 nagios nagios  384952 Feb  8 23:47 mem.rrd
-rw-r--r-- 1 nagios nagios   11225 Feb  8 23:47 mem.xml
-rw-r--r-- 1 nagios nagios  384952 Feb  8 23:48 swap_Partition.rrd
-rw-r--r-- 1 nagios nagios   11227 Feb  8 23:48 swap_Partition.xml


[root@why-3 libexec]# cat check_passwd 
#!/bin/sh
# Nagios plugin: verify /etc/passwd against the checksum saved in /etc/ps.md5.
# The text after '|' is performance data in the form label=value;warn;crit;min;max.
# Exits 0 (OK) when the checksum matches, 2 (CRITICAL) otherwise.

# Number of "OK" lines reported by md5sum; 1 means the file is unchanged.
char=`md5sum -c /etc/ps.md5 2>/dev/null | grep OK | wc -l`
if [ "$char" -eq 1 ];then
    echo "passwd is ok|status=$char;0;0;;"
    exit 0
else
    # The original listing lacked 'echo' here, so the shell would have tried
    # to execute the message as a command.
    echo "passwd is changed|status=$char;0;0;;"
    exit 2
fi
[root@why-3 libexec]# ./check_passwd 
passwd is ok|status=1;0;0;;
[root@why-3 libexec]# pkill nrpe
[root@why-3 libexec]# /usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d

然后就可以看到,上面输出结果中“|”后面的值可以提供给pnp和rrdtool生成数据图表。注意“|”后面以“;”分隔的各个字段分别代表不同的意思:第一个是当前的值,第二个是warning值,第三个是临界(critical)值,第四个为最小值,第五个为最大值。





[root@why-3 etc]# vi objects/contacts.cfg
define contact{
        contact_name                    nagiosadmin             ; Short name of user
        use                             generic-contact         ; Inherit default values from the generic-contact template
        alias                           Nagios Admin            ; Full name of user

        email                           18832869630@163.com     ; address that receives the notification mails
        }
[root@why-3 etc]# cat objects/templates.cfg
define contact{
        name                            generic-contact     ; The name of this contact template
        service_notification_period     24x7            ; service notifications can be sent anytime
        host_notification_period        24x7            ; host notifications can be sent anytime
        service_notification_options    w,u,c,r,f,s     ; send notifications for all service states, flapping events, and scheduled downtime events
        host_notification_options       d,u,r,f,s       ; send notifications for all host states, flapping events, and scheduled downtime events
        service_notification_commands   notify-service-by-email ; send service notifications via email
        host_notification_commands      notify-host-by-email    ; send host notifications via email
        register                        0               ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL CONTACT, JUST A TEMPLATE!
        }


[root@why-3 etc]# cat objects/commands.cfg
# 'notify-host-by-email' command definition
define command{
    ; formats the host alert with printf and pipes it to /bin/mail, addressed to $CONTACTEMAIL$
    command_name    notify-host-by-email
    command_line    /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\nHost: $HOSTNAME$\nState: $HOSTSTATE$\nAddress: $HOSTADDRESS$\nInfo: $HOSTOUTPUT$\n\nDate/Time: $LONGDATETIME$\n" | /bin/mail -s "** $NOTIFICATIONTYPE$ Host Alert: $HOSTNAME$ is $HOSTSTATE$ **" $CONTACTEMAIL$
    }

# 'notify-service-by-email' command definition
define command{
    ; formats the service alert with printf and pipes it to /bin/mail, addressed to $CONTACTEMAIL$
    command_name    notify-service-by-email
    command_line    /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\n\nService: $SERVICEDESC$\nHost: $HOSTALIAS$\nAddress: $HOSTADDRESS$\nState: $SERVICESTATE$\n\nDate/Time: $LONGDATETIME$\n\nAdditional Info:\n\n$SERVICEOUTPUT$\n" | /bin/mail -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$
    }


[root@why-3 etc]# tail -n 6 objects/service.cfg
define service {
        use                     generic-service             ; inherit defaults from the generic-service template
        host_name               why-203
        service_description     passwd
        check_command           check_nrpe!check_passwd     ; run the check_passwd command through NRPE
        }


4. 检查模板中是否调用

[root@why-3 etc]# cat objects/templates.cfg
define service{
        name                            generic-service         ; The 'name' of this service template
        active_checks_enabled           1                       ; Active service checks are enabled
        passive_checks_enabled          1                       ; Passive service checks are enabled/accepted
        parallelize_check               1                       ; Active service checks should be parallelized (disabling this can lead to major performance problems)
        obsess_over_service             1                       ; We should obsess over this service (if necessary)
        check_freshness                 0                       ; Default is to NOT check service 'freshness'
        notifications_enabled           1                       ; Service notifications are enabled
        event_handler_enabled           1                       ; Service event handler is enabled
        flap_detection_enabled          1                       ; Flap detection is enabled
        failure_prediction_enabled      1                       ; Failure prediction is enabled
        process_perf_data               1                       ; Process performance data
        retain_status_information       1                       ; Retain status information across program restarts
        retain_nonstatus_information    1                       ; Retain non-status information across program restarts
        is_volatile                     0                       ; The service is not volatile
        check_period                    24x7                    ; The service can be checked at any time of the day
        max_check_attempts              3                       ; Re-check the service up to 3 times in order to determine its final (hard) state
        normal_check_interval           10                      ; Check the service every 10 minutes under normal conditions
        retry_check_interval            2                       ; Re-check the service every two minutes until a hard state can be determined
        contact_groups                  admins                  ; Notifications get sent out to everyone in the 'admins' group
        notification_options            w,u,c,r                 ; Send notifications about warning, unknown, critical, and recovery events
        notification_interval           60                      ; Re-notify about service problems every hour
        notification_period             24x7                    ; Notifications can be sent out at any time
        register                        0                       ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL SERVICE, JUST A TEMPLATE!
        }


[root@why-3 etc]# service nagios reload
Running configuration check...done.
Reloading nagios configuration...done
[root@why-3 etc]# useradd testmail

可以看到监控端已经出现CRITICAL,并且接收到了报警邮件










[root@why-2 ~]# yum install -y mailx
[root@why-2 ~]# vi /etc/mail.rc
# Sender address and the SMTP server used to relay outgoing mail.
set from=18832869630@163.com smtp=smtp.163.com
# SMTP credentials. Per the original inline note, smtp-auth-password holds the
# mailbox's authorization password; the note is kept here as a comment so it
# does not become part of the password value.
set smtp-auth-user=18832869630@163.com smtp-auth-password=why123456
set smtp-auth=login
[root@why-2 ~]# echo "this is test mail." |mail -s "test mail" 93216193@qq.com


