#!/usr/bin/perl

# check_diskio is a Nagios plugin to monitor the amount of disk
# I/O (in sectors) on Linux 2.6 and 2.4 systems
#
# See the INSTALL file for installation instructions
#
# Copyright (c) 2007,2008 ETH Zurich.
#
# This module is free software; you can redistribute it and/or modify it
# under the terms of GNU general public license (gpl) version 3.
# See the LICENSE file for details.
#
# RCS information
# enable substitution with:
#   $ svn propset svn:keywords "Id Revision HeadURL Source Date"
#
#   $Id: check_diskio 1064 2009-06-09 05:56:58Z corti $
#   $Revision: 1064 $
#   $HeadURL: https://svn.id.ethz.ch/nagios_plugins/check_diskio/check_diskio $
#   $Date: 2009-06-09 07:56:58 +0200 (Tue, 09 Jun 2009) $

use 5.008;
use strict;
use warnings;
use Carp;

use version; our $VERSION = '3.0.3';

use Array::Unique;
use English qw(-no_match_vars);
use Getopt::Long;
use File::Slurp;
use List::MoreUtils qw(any);
use List::Util qw(first);
use List::MoreUtils qw(any);
use Nagios::Plugin::Getopt;
use Nagios::Plugin::Threshold;
use Nagios::Plugin;
use Number::Format qw(format_number);
use POSIX qw(uname);
use Readonly;

use Data::Dumper;

Readonly my $overflow => 2**32;

Readonly my $bits_per_byte      => 8;
Readonly my $bytes_per_kilobit  => 128;
Readonly my $bytes_per_sector   => 512;
Readonly my $bytes_per_kilobyte => 1_024;
Readonly my $bytes_per_megabit  => 131_072;
Readonly my $bytes_per_megabyte => 1_048_576;

# IMPORTANT: Nagios plugins can be executed using embedded Perl. In that case
#            the main routine is executed as a subroutine, and all the
#            declared subroutines therefore become inner subroutines.
#            As a consequence, global lexical variables do not stay shared
#            with the subroutines!
#
# The variables used inside the subroutines are therefore declared as
# package variables...
#
use vars qw(
  $format
  $plugin
  $threshold
  $options
  %tmp_files
  %diskio
  %diskio_w
  %diskio_r
  @stat_file
);

##############################################################################
# subroutines

##############################################################################
# Usage     : verbose("some message string", $optional_verbosity_level);
# Purpose   : print a message when the requested verbosity is high enough
# Returns   : n/a
# Arguments : message : message string (printed as is, no newline is added)
#             level   : the message is printed only if the value returned by
#                       $options->verbose() is greater than this level
#                       (default 0)
# Throws    : exits with UNKNOWN if no message is given
# Comments  : relies on the package variables $options and $plugin
# See also  : n/a
sub verbose {

    my ( $message, $level ) = @_;

    defined $message
      or $plugin->nagios_exit( UNKNOWN,
        q{Internal error: not enough parameters for 'verbose'} );

    # a missing level behaves like level 0
    my $required_level = defined $level ? $level : 0;

    print $message if $options->verbose() > $required_level;

    return;

}

##############################################################################
# Usage     : @output = exec_command( $command );
# Purpose   : executes the command and returns the output
# Returns   : array with the standard output, one chomped line per element
# Arguments : $command : the command to execute (a single string handed to
#                        a piped open)
# Throws    : exits with UNKNOWN if the command cannot be started or if
#             closing the pipe fails with a system error
# Comments  : a non-zero exit status of the command is not an error: the
#             callers inspect the output themselves
# See also  : n/a
sub exec_command {

    my ($command) = @_;

    my @result;

    verbose( "Executing '$command'\n", 1 );

    open my $output, q{-|}, $command
      or $plugin->nagios_exit( UNKNOWN, "Cannot execute $command: $OS_ERROR" );

    # read into a lexical: a bare while (<$output>) would overwrite the
    # caller's $_
    while ( defined( my $line = <$output> ) ) {
        chomp $line;
        push @result, $line;
    }

    if (  !( close $output )
        && ( $OS_ERROR != 0 ) )
    {

        # close on a piped open also returns false when the command exits
        # with a non-zero status. In this case $! is set to 0 and we do not
        # get here: only real system errors are reported
        $plugin->nagios_exit( UNKNOWN,
            "Error while closing pipe to '$command': $OS_ERROR" );

    }

    return @result;

}

##############################################################################
# Usage     : write_timer($tmp, $in, $out)
# Purpose   : stores the current time and the two I/O counters in the
#             temporary file (one line: "<time> <in> <out>")
# Returns   : n/a
# Arguments : $tmp : temporary file name
#             $in  : first counter to store
#             $out : second counter to store
# Throws    : exits with UNKNOWN if the file cannot be opened or closed
# Comments  : the file is truncated at each call
# See also  : read_timer
sub write_timer {

    my ( $tmp, $in, $out ) = @_;

    # timestamp stored together with the counters
    my $now = time;

    my $summary =
        'writing data: '
      . format_number( $in, 0, 0 ) . q{/}
      . format_number( $out, 0, 0 ) . q{ @ }
      . format_number( $now, 0, 0 ) . "\n";
    verbose( $summary, 1 );

    open my $timer_fh, q{>}, $tmp
      or $plugin->nagios_exit( UNKNOWN,
        "Cannot initialize timer ($tmp): $OS_ERROR" );

    print {$timer_fh} "$now $in $out\n";

    close $timer_fh
      or $plugin->nagios_exit( UNKNOWN, "Cannot close timer: $OS_ERROR" );

    return;

}

##############################################################################
# Usage     : ($diff, $in, $out) = read_timer($tmp)
# Purpose   : reads the time and I/O data from the temporary file
# Returns   : ($diff, $in, $out) seconds elapsed since the data was stored
#             and the two stored counters
# Arguments : $tmp : temporary file name
# Throws    : exits with UNKNOWN if the file cannot be opened or closed
# Comments  : if the file holds several lines the last one wins; if it is
#             empty all the returned values are undefined
# See also  : write_timer
sub read_timer {

    my ($tmp) = @_;

    my ( $stamp, $in, $out, $elapsed );

    open my $timer_fh, q{<}, $tmp
      or $plugin->nagios_exit( UNKNOWN, "Cannot open timer ($tmp): $OS_ERROR" );

    while (<$timer_fh>) {
        chomp;

        # "<time> <in> <out>"
        ( $stamp, $in, $out ) = split;
        $elapsed = time - $stamp;
    }

    close $timer_fh
      or $plugin->nagios_exit( UNKNOWN, "Cannot close timer: $OS_ERROR" );

    my $summary =
        'reading data: '
      . format_number( $in, 0, 0 ) . q{/}
      . format_number( $out, 0, 0 ) . q{ @ }
      . format_number( $stamp, 0, 0 )
      . ' (diff '
      . format_number( $elapsed, 0, 0 ) . ")\n";
    verbose( $summary, 1 );

    return ( $elapsed, $in, $out );

}

##############################################################################
# Usage     : $version = kernel_version();
# Purpose   : detects the Linux kernel version
# Returns   : '2.4' or '2.6' (kernel version w/o minor number)
# Arguments : n/a
# Throws    : exits with UNKNOWN for any other kernel release
# Comments  : only the 2.4 and 2.6 series are recognized: every other
#             release reported by uname is rejected
# See also  : n/a
sub kernel_version {

    # third element of uname: the release string (e.g. "2.6.32-5-amd64")
    my $release = ( uname() )[2];

    for my $supported (qw(2.4 2.6)) {

        # \Q...\E makes the dot literal (a bare "." would also accept
        # e.g. "2x4") and (?!\d) rejects releases such as "2.60"
        if ( $release =~ /\A \Q$supported\E (?! \d )/xms ) {
            return $supported;
        }

    }

    $plugin->nagios_exit( UNKNOWN,
        "Error: unsupported kernel version ($release)" );

    return;

}

##############################################################################
# Usage     : ($s_read, $s_write) = read_26($device)
# Purpose   : reads kernel 2.6 disk stats
# Returns   : ($s_read, $s_write) sectors read and sectors written for the
#             device, taken from the lines stored in @stat_file
# Arguments : $device : device name as listed in /proc/diskstats
#                       (without the leading /dev/)
# Throws    : exits with UNKNOWN if the device is not listed
# Comments  : does not modify $_
# See also  : read_24
sub read_26 {

    my $device = shift;

    # /proc/diskstats format (whitespace separated, 0-based field index)
    #
    #  0 major        Major number
    #  1 minor        Minor number
    #  2 name         Name
    #  3 reads        Total number of reads completed successfully.
    #  4 merge_read   Reads which were merged: adjacent requests may be
    #                 merged for efficiency, so two 4K reads may become one
    #                 8K read before being handed to the disk and are then
    #                 counted (and queued) as only one I/O.
    #  5 s_read       Total number of sectors read successfully.
    #  6 ms_read      Total number of milliseconds spent by all reads.
    #  7 writes       Total number of writes completed successfully.
    #  8 merge_write  Writes which were merged (see merge_read).
    #  9 s_write      Total number of sectors written successfully.
    # 10 ms_write     Total number of milliseconds spent by all writes.
    # 11 ios          The only field that should go to zero. Incremented as
    #                 requests are queued and decremented as they finish.
    # 12 ms_io        Increases as long as ios is nonzero.
    # 13 ms_weighted  Incremented at each I/O start, I/O completion, I/O
    #                 merge, or read of these stats by the number of I/Os in
    #                 progress times the number of milliseconds spent doing
    #                 I/O since the last update of this field.
    #
    # Partition lines of kernels 2.6.0 to 2.6.24 only carry four counters:
    #
    #  3 reads issued, 4 sectors read, 5 writes issued, 6 sectors written
    my $name_field           = 2;
    my @full_sector_fields   = ( 5, 9 );
    my @short_sector_fields  = ( 4, 6 );
    my $short_line_field_cnt = 7;

    foreach my $line (@stat_file) {

        # splitting on q{ } skips any leading whitespace (like the default
        # split on $_) without overwriting the caller's $_
        my @fields = split q{ }, $line;

        # blank or truncated lines have no name
        my $name = $fields[$name_field];
        next if !defined $name;
        next if $name ne $device;

        if ( @fields == $short_line_field_cnt ) {
            return @fields[@short_sector_fields];
        }

        return @fields[@full_sector_fields];

    }

    $plugin->nagios_exit( UNKNOWN, "Device $device not found" );

    return;

}

##############################################################################
# Usage     : ($rsect, $wsect) = read_24($device)
# Purpose   : reads kernel 2.4 disk stats
# Returns   : ($rsect, $wsect) read and write sectors for the device, taken
#             from the lines stored in @stat_file
# Arguments : $device : device name as listed in /proc/partitions
#                       (without the leading /dev/)
# Throws    : exits with UNKNOWN if the device is not listed
# Comments  : does not modify $_
# See also  : read_26
sub read_24 {

    my $device = shift;

    # kernel version 2.4

    # /proc/partitions format (whitespace separated, 0-based field index)
    #
    #  0 major         Major number
    #  1 minor         Minor number
    #  2 blocks        Number of blocks
    #  3 name          Name
    #  4 rio           Number of read IO requests completed
    #  5 rmerge        Number of submitted read requests that were merged
    #                  into existing requests.
    #  6 rsect         Number of read IO sectors submitted
    #  7 ruse          Total length of time all completed read requests have
    #                  taken to date, in milliseconds
    #  8 wio           Number of write IO requests completed
    #  9 wmerge        Number of submitted write requests that were merged
    #                  into existing requests.
    # 10 wsect         Number of write IO sectors submitted
    # 11 wuse          Total length of time all completed write requests
    #                  have taken to date, in milliseconds
    # 12 running       Instantaneous count of IOs currently in flight
    # 13 use           How many milliseconds there has been at least one
    #                  IO in flight
    # 14 aveq          The sum of how long all requests have spent in
    #                  flight, in milliseconds
    my @wanted_fields = ( 3, 6, 10 );

    foreach my $line (@stat_file) {

        # splitting on q{ } skips any leading whitespace (like the default
        # split on $_) without overwriting the caller's $_
        my ( $name, $rsect, $wsect ) =
          ( split q{ }, $line )[@wanted_fields];

        # blank lines have no name and the header line carries the
        # literal column title 'name': neither describes a device
        next if !defined $name || $name eq 'name';

        verbose( "Information found for '$name'\n", 1 );

        if ( $name =~ /^ide.*bus([0-9]).*target([0-9]).*lun([0-9])/mxs ) {

            # the mapping of this kind of name to a device is not
            # implemented: the name is only reported
            verbose("Mapping $name to \n");

        }

        if ( $name eq $device ) {
            return ( $rsect, $wsect );
        }

    }

    # same behaviour as read_26: a missing device must not be reported as
    # a device without I/O
    $plugin->nagios_exit( UNKNOWN, "Device $device not found" );

    return;

}

##############################################################################
# Usage     : whoami()
# Purpose   : retrieve the user running the process
# Returns   : username (name of the effective user ID)
# Arguments : n/a
# Throws    : exits with UNKNOWN if the name cannot be determined
# Comments  : uses getpwuid instead of running the external whoami command:
#             no process is spawned, there is no dependency on the PATH and
#             $_ is left untouched
# See also  : n/a
sub whoami {

    # in scalar context getpwuid returns the login name
    my $user = getpwuid $EFFECTIVE_USER_ID;

    if ( !defined $user || $user eq q{} ) {
        $plugin->nagios_exit( UNKNOWN, 'Cannot determine the user' );
    }

    return $user;

}

##############################################################################
# main
#

################
# initialization

# default unit of measurement (sectors), can be overridden with --uom
$format = q{s};
$plugin = Nagios::Plugin->new( shortname => 'CHECK_DISKIO' );

########################
# Command line arguments

my $usage = <<'EOT';
check_diskio --device=devicename --critical=critical --warning=warning
             [--help] [--reset] [--silent] [--ssize=size]
             [--verbose] [--version] [--uom=unit]
EOT

# the second column lists the label shown in the plugin output
my $extra = <<'EOT';

Units of measurement:

unit                       description
---------------------------------------------
  
sector,sectors,s           sectors/s
bit,bits,b        bps      bits per second
byte,bytes,B      Bps      bytes per second
Kb                Kbps     Kbits per second
KB,K              KBps     Kbytes per second
Mb                Mbps     Mbits per second
MB,M              MBps     Mbytes per second

EOT

$options = Nagios::Plugin::Getopt->new(
    usage   => $usage,
    extra   => $extra,
    version => $VERSION,
    url     => 'https://trac.id.ethz.ch/projects/nagios_plugins',
    blurb   => 'Monitor disk I/O',
);

# --device can be repeated: the values are collected in an array
$options->arg(
    spec     => 'device|d=s@',
    help     => 'device name(s) (or mount point(s))',
    required => 1,
);

# the thresholds are compared with the total I/O rate after the conversion
# to the selected unit of measurement
$options->arg(
    spec => 'critical|c=i',
    help =>
      'critical threshold (in the unit selected with --uom, sectors/s by default)',
    required => 1,
);

$options->arg(
    spec => 'warning|w=i',
    help =>
      'warning threshold (in the unit selected with --uom, sectors/s by default)',
    required => 1,
);

$options->arg(
    spec     => 'reset|r',
    help     => 'reset the counter(s)',
    required => 0,
);

$options->arg(
    spec     => 'silent|s',
    help     => 'no warnings or criticals are issued',
    required => 0,
);

$options->arg(
    spec     => 'ssize=i',
    help     => 'specify the sector size in bytes (default 512)',
    required => 0,
    default  => $bytes_per_sector,
);

$options->arg(
    spec     => 'uom=s',
    help     => 'unit of measurement',
    required => 0,
);

$options->getopts();

################################################################################
# Sanity checks

################
# kernel version

my $kversion = kernel_version();

# readability of the two possible statistics sources
my $proc_diskstats_available  = ( -r '/proc/diskstats' );
my $proc_partitions_available = ( -r '/proc/partitions' );

#########################
# Disk stats (read files)

# a 2.6 kernel can use either file, other kernels need /proc/partitions
if ( $kversion ne '2.6' ) {
    $proc_partitions_available
      or $plugin->nagios_exit( UNKNOWN, '/proc/partitions not readable' );
}
elsif ( !( $proc_diskstats_available || $proc_partitions_available ) ) {
    $plugin->nagios_exit( UNKNOWN,
        '/proc/diskstats or /proc/partitions not readable' );
}

# /proc/diskstats is preferred whenever it is readable
@stat_file = File::Slurp::read_file(
    $proc_diskstats_available ? '/proc/diskstats' : '/proc/partitions' );

#########
# Devices
#
# Builds the list of the devices to check from the --device arguments:
#  - arguments not beginning with /dev/ are considered mount points and are
#    mapped to a device using /etc/mtab
#  - devices not listed in the kernel statistics are considered LVM volumes
#    and are replaced by the physical volumes of their volume group (this
#    needs root)

my @devices;
tie @devices, 'Array::Unique';

for my $device ( @{ $options->device() } ) {

    # Detect device from mount point
    if ( !( $device =~ /^\/dev\//mxs ) ) {
        if ( $device ne q{/} && $device =~ /\/$/mxs ) {

            # remove trailing /
            $device =~ s/\/$//mxs;
        }

        my $mount_point = $device;

        undef $device;

        open my $MTAB_FH, q{<}, '/etc/mtab'
          or
          $plugin->nagios_exit( UNKNOWN, "Cannot open /etc/mtab: $OS_ERROR" );
        while ( defined( my $entry = <$MTAB_FH> ) ) {

            # \Q...\E: the mount point is matched literally, it could
            # contain characters with a special meaning in a pattern
            if ( $entry =~ /(\/dev.*)\ +\Q$mount_point\E\ +.*/mxs ) {
                $device = $1;
                last;
            }
        }
        close $MTAB_FH
          or $plugin->nagios_exit( UNKNOWN,
            "Error while closing /etc/mtab: $OS_ERROR" );

        if ( !$device ) {
            $plugin->nagios_exit( UNKNOWN,
                "Could not find a device for $mount_point" );
        }

        # device found: strip the partition number
        $device =~ s/p?[0-9]+$//mxs;

        verbose("Mount point $mount_point corresponds to device $device\n");

    }

    my $dev_name = $device;
    $dev_name =~ s/^\/dev\///mxs;

    # the name is matched literally (\Q...\E) against the statistics
    if ( !any { /\s\Q$dev_name\E\s/msx } @stat_file ) {

        # LVM?
        if ( whoami() eq 'root' ) {

            # the names are handed to a shell: they are single-quoted so
            # that they cannot be interpreted as shell syntax
            ( my $shell_device = $device ) =~ s/'/'\\''/gmxs;

            my @output = exec_command("lvdisplay '$shell_device' 2>&1 ");

            if ( any { /not\ found/mxs } @output ) {
                verbose("LVM volume not found\n");
            }
            else {

                my @volume_groups = grep { /VG\ Name/mxs } @output;
                if ( @volume_groups == 1 ) {

                    # keep only the last word of the line: the group name
                    $volume_groups[0] =~ s{.*\ }{}mxs;

                    ( my $shell_group = $volume_groups[0] ) =~
                      s/'/'\\''/gmxs;

                    @output =
                      exec_command("vgdisplay -v '$shell_group' 2>&1 ");

                    if ( any { /not\ found/mxs } @output ) {
                        verbose("Cannot get info on $volume_groups[0]\n");
                    }
                    else {

                        my @pvs = grep { /PV\ Name/mxs } @output;

                        if ( @pvs < 1 ) {
                            verbose("No physical volumes found\n");
                        }
                        else {

                            # strip everything but the device name from
                            # the output
                            for (@pvs) {
                                s{.*\s+/dev/([a-z]+).*}{/dev/$1}mxs;
                            }

                            push @devices, @pvs;

                        }

                    }

                }

            }

        }
        else {

            print {*STDERR} "Warning: cannot check LVM volumes if not root\n";

            verbose("Not root: skipping LVM check\n");

        }

    }
    else {

        push @devices, $device;

    }

}

# without any device nothing would be measured and the plugin would end
# with OK
if ( !@devices ) {
    $plugin->nagios_exit( UNKNOWN, 'No device to check' );
}

# check if the devices exist

for my $device (@devices) {
    if ( !-e $device ) {
        $plugin->nagios_exit( UNKNOWN, "Device $device not found" );
    }
}

verbose("Checking: @devices\n");

#####
# UOM
#
# Selects the factor used to convert sectors/s to the requested unit and
# the label shown in the output

my $multiplier;
my $UOM;

if ( $options->uom() ) {
    $format = $options->uom();
}

# the trailing "s" is optional: the help lists both the singular and the
# plural names
if ( $format eq 's' || $format =~ /\A sectors? \z/xms ) {

    #sectors
    $multiplier = 1;
    $UOM        = 'sectors/s';

}
elsif ( $format eq 'b' || $format =~ /\A bits? \z/xms ) {

    # bits
    $multiplier = $options->ssize() * $bits_per_byte;
    $UOM        = 'bps';

}
elsif ( $format eq 'B' || $format =~ /\A bytes? \z/xms ) {

    # bytes
    $multiplier = $options->ssize();
    $UOM        = 'Bps';

}
elsif ( $format eq 'K' || $format eq 'KB' ) {

    # KB ( / 1024 )
    $multiplier = $options->ssize() / $bytes_per_kilobyte;
    $UOM        = 'KBps';

}
elsif ( $format eq 'Kb' ) {

    # Kb ( * 8 / 1024 )
    $multiplier = $options->ssize() / $bytes_per_kilobit;
    $UOM        = 'Kbps';

}
elsif ( $format eq 'M' || $format eq 'MB' ) {

    # MB ( / 1024 / 1024 )
    $multiplier = $options->ssize() / $bytes_per_megabyte;
    $UOM        = 'MBps';

}
elsif ( $format eq 'Mb' ) {

    # Mb ( * 8 / 1024 / 1024 )
    $multiplier = $options->ssize() / $bytes_per_megabit;
    $UOM        = 'Mbps';

}
else {
    $plugin->nagios_exit( UNKNOWN, "Unknown unit of measurement: $format" );
}

################
# Initialization

$threshold = Nagios::Plugin::Threshold->set_thresholds(
    warning  => $options->warning(),
    critical => $options->critical(),
);

# the user name is part of the file name: each user has its own state files
my $tmp_file_prefix = '/tmp/check_diskio_status-' . whoami();

# strip /dev/
foreach my $entry (@devices) {
    $entry =~ s/^\/dev\///mxs;
}

# we need one temporary file per device
foreach my $device (@devices) {

    # names with a directory part are identified by the part after the
    # first slash, the other ones by the whole name
    my ( $controller, $disk ) = split /\//mxs, $device;

    $tmp_files{$device} =
        $disk
      ? "$tmp_file_prefix--$disk"
      : "$tmp_file_prefix-$controller";

}

########################
# Check the proc entry
#
# For each device: reads the current counters, compares them with the ones
# stored by the previous run in the temporary file, stores the new ones and
# computes the I/O rates in the chosen unit of measurement

my $diff;
my $in;
my $out;

my $s_read;
my $s_write;

my $status = OK;
my @status_lines;

# the parser corresponds to the file stored in @stat_file
my $read_stats =
  ( $kversion eq '2.6' && $proc_diskstats_available )
  ? \&read_26
  : \&read_24;

for my $device (@devices) {

    ( $s_read, $s_write ) = $read_stats->($device);

    verbose(
        'current data: '
          . format_number( $s_read, 0, 0 ) . q{/}
          . format_number( $s_write, 0, 0 ) . "\n",
        1
    );

    if ( !-f $tmp_files{$device} ) {
        verbose("temporary file not available resetting and waiting\n");
        write_timer( $tmp_files{$device}, $s_write, $s_read );
        sleep 1;
        ( $s_read, $s_write ) = $read_stats->($device);
    }

    # the timer stores the written sectors as $in and the read ones as $out
    ( $diff, $in, $out ) = read_timer( $tmp_files{$device} );

    if ( $diff < 1 ) {

        # wait a little bit and sample again, so that the counters
        # correspond to the measured interval
        sleep 1;
        ( $s_read, $s_write ) = $read_stats->($device);
        ( $diff, $in, $out ) = read_timer( $tmp_files{$device} );
    }

    write_timer( $tmp_files{$device}, $s_write, $s_read );

    verbose( "time difference: $diff\n", 1 );

    if ( $diff < 1 ) {

        # round up: avoids a division by zero and negative rates (the
        # interval is negative if the clock was set back)
        $diff = 1;
    }

    # check for overflows (2^32 sectors)
    if ( $s_write < $in ) {

        # overflow
        $diskio_w{$device} = $overflow - $in + $s_write;
    }
    else {
        $diskio_w{$device} = $s_write - $in;
    }

    if ( $s_read < $out ) {

        # overflow
        $diskio_r{$device} = $overflow - $out + $s_read;
    }
    else {
        $diskio_r{$device} = $s_read - $out;
    }

    # sectors per second
    $diskio_w{$device} = int( $diskio_w{$device} / $diff );
    $diskio_r{$device} = int( $diskio_r{$device} / $diff );

    # conversion to the chosen unit of measurement
    $diskio_w{$device} = $diskio_w{$device} * $multiplier;
    $diskio_r{$device} = $diskio_r{$device} * $multiplier;

    $diskio{$device} = $diskio_w{$device} + $diskio_r{$device};

    # no UOM according to the guidelines
    #   http://nagiosplug.sourceforge.net/developer-guidelines.html#AEN203

    my $suffix = q{};

    # we append the device name to the labels only if there is more
    # than one device to maintain backwards compatibility
    if ( @devices > 1 ) {
        $suffix = "_\U$device";
    }

    $plugin->add_perfdata(
        label     => "WRITE$suffix",
        value     => sprintf( '%.0f', $diskio_w{$device} ),
        uom       => q{},
        threshold => $threshold,
    );

    $plugin->add_perfdata(
        label     => "READ$suffix",
        value     => sprintf( '%.0f', $diskio_r{$device} ),
        uom       => q{},
        threshold => $threshold,
    );

    $plugin->add_perfdata(
        label     => "TOTAL$suffix",
        value     => sprintf( '%.0f', $diskio{$device} ),
        uom       => q{},
        threshold => $threshold,
    );

    # the global status is the worst status of the single devices
    my $device_status = $threshold->get_status( $diskio{$device} );
    if ( $device_status == CRITICAL ) {
        $status = CRITICAL;
    }
    elsif ( $device_status == WARNING && $status != CRITICAL ) {
        $status = WARNING;
    }

    push @status_lines,
      "$device " . sprintf( '%.0f', $diskio{$device} ) . " $UOM";

}

# in silent mode the result is always OK, otherwise the worst status of the
# checked devices is returned together with the measured rates
if ( $options->silent() ) {

    # the space keeps the last device name separated from the status
    $plugin->nagios_exit( OK, ( join ', ', @devices ) . ' OK' );
}
else {
    $plugin->nagios_exit( $status, join ', ', @status_lines );
}

1;
