From b8175c413c2c6eb85d8e52cb5fade398ce61f46a Mon Sep 17 00:00:00 2001 From: Sergey Poznyakoff Date: Wed, 11 Mar 2020 16:01:32 +0200 Subject: Improve diagnostics of invalid replies. Revise log verbosity settings. --- doc/ping903.8 | 11 ++++- src/pinger.c | 137 ++++++++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 114 insertions(+), 34 deletions(-) diff --git a/doc/ping903.8 b/doc/ping903.8 index bde1f51..9349fca 100644 --- a/doc/ping903.8 +++ b/doc/ping903.8 @@ -13,7 +13,7 @@ .\" .\" You should have received a copy of the GNU General Public License .\" along with Ping903. If not, see . -.TH PING903 8 "March 10, 2020" "PING903" "System Administration" +.TH PING903 8 "March 11, 2020" "PING903" "System Administration" .SH NAME ping903 \- high-performance ICMP monitoring daemon .SH SYNOPSIS @@ -139,7 +139,14 @@ Don't start supervisor process. Print program version, copyright information, and exit. .TP .B \-v -Turn on additional logging. +Turn on additional logging. This option can be given several times to +request more verbose output. Given a single \fB\-v\fR option, the +program prints at the end of each probe the total number of echo +requests sent and replies received. Two options (\fB\-vv\fR) enable +additional diagnostics of invalid echo replies. Three options enable +logging of each received echo reply, and four options enable verbose +logging of each echo request sent. Note that three or more +\fB\-v\fR options can produce a huge amount of log output. .SH BUGS Only IPv4 is currently supported. .SH SEE ALSO diff --git a/src/pinger.c b/src/pinger.c index 8bb1090..53fedfa 100644 --- a/src/pinger.c +++ b/src/pinger.c @@ -39,9 +39,13 @@ #include "json.h" #include "defs.h" +/* Time in seconds between two subsequent probes. */ unsigned long probe_interval = 60; +/* Time between two subsequent echo requests within the same probe. 
*/ unsigned long ping_interval = 1; +/* Number of echo requests per probe */ unsigned long ping_count = 10; +/* Number of unanswered echo requests after which the host is declared dead. */ unsigned long ping_tolerance = 3; /* Initial value for the tmin member of struct hostping */ @@ -236,7 +240,6 @@ typedef enum update_type { static pthread_mutex_t update_mutex = PTHREAD_MUTEX_INITIALIZER; static int check_host(char const *name); static int update_add(UPDATE_TYPE t, void *data); -static void update_commit(void); void pinger_setup(void) @@ -965,8 +968,10 @@ static pthread_mutex_t sendq_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t sendq_cond = PTHREAD_COND_INITIALIZER; static int send_p; -static unsigned xmit_total; -static unsigned recv_total; +static unsigned long probe_num; /* Ordinal number of the current probe. */ +/* Totals for the current probe: */ +static unsigned xmit_total; /* Number of requests transmitted. */ +static unsigned recv_total; /* Number of replies received. */ #define ICMP_HEADER_LEN (offsetof(struct icmp, icmp_data)) #define PING_DATALEN (64 - ICMP_HEADER_LEN) @@ -974,25 +979,39 @@ static unsigned recv_total; size_t data_length = PING_DATALEN; static unsigned char *data_buffer; +/* Ping identifier (for the icmp_id member of struct icmp) */ static int ping_ident; +/* Constants for sequence number database management */ + +/* Max. time in seconds after which a sequence database entry can be reused. */ #define MAX_PING_TIMEOUT 10 + enum { - MAX_SEQNO = USHRT_MAX, - MOD_SEQNO = MAX_SEQNO + 1 + MAX_SEQNO = USHRT_MAX, /* Max. value for the sequence number */ + MOD_SEQNO = MAX_SEQNO + 1 /* Modulus for computing next sequence + number. */ }; +/* Sequence number index entry. */ struct seqidx { - HOSTPING *host; - struct timeval tv; - int ping_num; + HOSTPING *host; /* Associated host. */ + struct timeval tv; /* Time the echo was sent. */ + unsigned long probe_num; /* Number of the probe within which the + echo was sent. 
+ */ + int ping_num; /* Number of echo request within the probe. */ }; -static struct seqidx *seqidx; -static unsigned short next_seqno; +static struct seqidx *seqidx; /* Sequence number database. */ +static unsigned short next_seqno; /* Next sequence number. */ +/* Protect simultaneous access to seqidx. */ static pthread_mutex_t seqno_mutex = PTHREAD_MUTEX_INITIALIZER; +/* Allocate and return the sequence number for the given host and + * transmission time. + * Return negative value if the number cannot be allocated. + */ static int seqno_alloc(HOSTPING *host, struct timeval *tv) { @@ -1001,6 +1020,7 @@ seqno_alloc(HOSTPING *host, struct timeval *tv) if (tv->tv_sec - seqidx[n].tv.tv_sec > MAX_PING_TIMEOUT) { memcpy(&seqidx[n].tv, tv, sizeof(*tv)); seqidx[n].host = host; + seqidx[n].probe_num = probe_num; seqidx[n].ping_num = host->xmit_count; next_seqno = (n + 1) % MOD_SEQNO; return n; @@ -1011,6 +1031,76 @@ seqno_alloc(HOSTPING *host, struct timeval *tv) return -1; } +/* Check the validity of the echo reply SEQ. Return 0 if the reply is + * valid, -1 otherwise. + * This function is called when both the seqidx array and the HOSTPING + * structure associated with SEQ are locked. + */ +static int +check_reply(int seq) +{ + int n = seqidx[seq].ping_num; + HOSTPING *host = seqidx[seq].host; + + if (seqidx[seq].probe_num != probe_num) { + /* Case 1. + * A latecomer reply, which arrived after its probe round + * was committed (see hostping_commit). + */ + if (verbose > 1) + info("%s: reply for discarded echo request #%d, " + "seqno %d; probe_num=%lu, current=%lu", + host->name, n, seq, seqidx[seq].probe_num, + probe_num); + } else if (host->xmit_count == 0) { + /* Case 2. + * A reply came while no echo requests were transmitted yet. + */ + info("%s: stray reply #%d, seqno %d; probe_num=%lu", + host->name, n, seq, probe_num); + } else if (n >= 0 && n < ping_count) { + if (n > host->xmit_count) { + /* Case 3. + * Similar to 2, except that some echoes were sent. 
+ */ + error("%s: phantom reply #%d, seqno %d; xmit_count=%lu", + host->name, n, seq, host->xmit_count); + } else if (++host->nreply[n] > 1) { + /* Case 4. + * Duplicate reply. + */ + host->dup_count++; + info("%s: duplicate reply for echo #%d, seqno %d", + host->name, n, seq); + } else if (host->recv_count == host->xmit_count) { + /* Case 5. + * Similar to 2 and 3. + * Each echo request was replied to, and yet another + * reply arrived, which is not a duplicate. + */ + error("%s: unexpected reply #%d, seqno %d; " + "xmit_count=recv_count=%lu", + host->name, n, seq, host->xmit_count); + } else { + /* Case 6. + * This is a valid reply; cases 3-5 are rejected below. */ + return 0; + } + } else { + /* Case 7. + * A reply with impossible echo request number. + * This one should not happen indeed. + */ + error("%s: reply for unregistered echo #%d, seqno %d", + host->name, n, seq); + } + return -1; +} + +/* Given an echo sequence number, return the locked HOSTPING structure + * associated with this echo request, or NULL if the reply is + * invalid. 
+ */ static HOSTPING * hostping_from_seqno(int seq) { @@ -1019,29 +1109,11 @@ hostping_from_seqno(int seq) pthread_mutex_lock(&seqno_mutex); host = seqidx[seq].host; if (host) { - int n; - HOSTPING *orig = host; - hostping_lock(host); - n = seqidx[seq].ping_num; - if (n >= 0 && n < ping_count) { - if (++host->nreply[n] > 1) { - host->dup_count++; - info("%s: duplicate reply for echo #%d, seqno %d", - host->name, n, seq); - host = NULL; - } else if (host->recv_count == host->xmit_count) { - error("%s: unexpected reply #%d, seqno %d", - host->name, n, seq); - host = NULL; - } - } else { - error("%s: duplicate reply for unregistered echo #%d, seqno %d", - host->name, n, seq); + if (check_reply(seq)) { + hostping_unlock(host); host = NULL; } - if (!host) - hostping_unlock(orig); } else fatal("no host found for sequence number %d", seq); pthread_mutex_unlock(&seqno_mutex); @@ -1192,7 +1264,7 @@ send_echo(HOSTPING *host, unsigned char *ping_buffer) data_length - sizeof(host->xmit_tv)); buflen = ICMP_HEADER_LEN + data_length; - if (verbose > 2) + if (verbose > 3) info("sending %zu bytes to %s, icmp_seq=%d", buflen, host->name, seqno); @@ -1378,7 +1450,7 @@ p903_receiver(void *p) host->recv_count++; - if (verbose > 1) + if (verbose > 2) log_echo((struct sockaddr *)&addr, addrlen, icmp, ip, n, rtt); if (host->recv_count == ping_count) @@ -1480,6 +1552,7 @@ p903_scheduler(void *p) /* Commit updates */ p903_update_commit(); + probe_num++; send_p = 1; pthread_cond_broadcast(&sendq_cond); pthread_mutex_unlock(&sendq_mutex); -- cgit v1.2.1