aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Sergey Poznyakoff <gray@gnu.org.ua> 2020-02-19 10:49:11 +0200
committer Sergey Poznyakoff <gray@gnu.org.ua> 2020-02-19 10:49:11 +0200
commit 998bea5d31b60f39c54f716063b44a5713fbc1c0 (patch)
tree 12d8a50f05fbe2a0631dab0e9eedbb4828b54103
parent f6be3cd55490eab72be3e497877b5de5758d9911 (diff)
download ping903-998bea5d31b60f39c54f716063b44a5713fbc1c0.tar.gz
ping903-998bea5d31b60f39c54f716063b44a5713fbc1c0.tar.bz2
Fix handling of missing replies.
* src/ping903.h (hostaddr): New member recv_tv. * src/ping903q.c (print_host_status): gracefully handle absence of the "stddev" attribute. Include IP address (or host name) in the nagios check output. * src/pinger.c: Get rid of the send queue. Send echo requests in batches each ping_interval. Correctly handle missing replies.
-rw-r--r-- src/ping903.h 1
-rw-r--r-- src/ping903q.c 26
-rw-r--r-- src/pinger.c 238
3 files changed, 122 insertions, 143 deletions
diff --git a/src/ping903.h b/src/ping903.h
index 5d8dd5b..52ed6c1 100644
--- a/src/ping903.h
+++ b/src/ping903.h
@@ -96,6 +96,7 @@ typedef struct hostaddr {
/* Current ping statistics */
struct timeval start_tv;
struct timeval xmit_tv;
+ struct timeval recv_tv;
unsigned long xmit_count;
unsigned long recv_count;
double tmin; /* minimum round trip time */
diff --git a/src/ping903q.c b/src/ping903q.c
index 53358b4..9e7a350 100644
--- a/src/ping903q.c
+++ b/src/ping903q.c
@@ -436,17 +436,18 @@ print_host_status(struct json_value *obj, void *unused)
json_number)->v.n;
double stop_ts = ejson_get(obj, "stop-timestamp",
json_number)->v.n;
-
+
printf("--- %s ping statistics ---\n", name);
printf("%lu packets transmitted, %lu received, %d%% packet loss, time %.0fms\n",
(unsigned long) xmit, (unsigned long) recv,
- (int)(100 * (xmit - recv) / recv),
+ (int)(100 * (xmit - recv) / xmit),
(stop_ts - start_ts) * 1000);
- printf("rtt min/avg/max/mdev = %.3f/%.3f/%.3f/%.3f ms\n",
- ejson_get(obj, "tmin", json_number)->v.n,
- ejson_get(obj, "avg", json_number)->v.n,
- ejson_get(obj, "tmax", json_number)->v.n,
- ejson_get(obj, "stddev", json_number)->v.n);
+ if (!json_object_get(obj, "stddev", &jv))
+ printf("rtt min/avg/max/mdev = %.3f/%.3f/%.3f/%.3f ms\n",
+ ejson_get(obj, "tmin", json_number)->v.n,
+ ejson_get(obj, "avg", json_number)->v.n,
+ ejson_get(obj, "tmax", json_number)->v.n,
+ ejson_get(obj, "stddev", json_number)->v.n);
}
return alive ? EX_NAGIOS_OK : EX_NAGIOS_CRITICAL;
}
@@ -536,6 +537,9 @@ match_host(char const *name)
size_t i, len;
n = snprintf(url, sizeof(url), "/match/%s", name);
+ if (n == -1 || n == sizeof(url)) {
+ abend("url buffer overflow");
+ }
http_query("GET", url, std_headers);
http_resp_init(&resp);
http_recv(&resp);
@@ -677,11 +681,11 @@ nagios_check(struct json_value *obj, void *data)
jv = ejson_get(obj, "status", json_string);
if (strcmp(jv->v.s, "init") == 0) {
- printf("PING UNKNOWN - waiting for data to arrive\n");
+ printf("PING %s UNKNOWN - waiting for data to arrive\n", name);
exit(EX_NAGIOS_UNKNOWN);
}
if (strcmp(jv->v.s, "invalid") == 0) {
- printf("PING CRITICAL - Packet loss = 100%%");
+ printf("PING %s CRITICAL - Packet loss = 100%%", name);
print_perfdata(cd->cth.round_trip, 100.0, cd);
exit(EX_NAGIOS_CRITICAL);
}
@@ -698,8 +702,8 @@ nagios_check(struct json_value *obj, void *data)
status = newstatus(status, EX_NAGIOS_CRITICAL);
if (loss >= cd->wth.loss_pct)
status = newstatus(status, EX_NAGIOS_WARNING);
- printf("PING %s Packet loss = %d%%, RTA = %.2f ms",
- status_str[status], (int)loss, rta);
+ printf("PING %s %s Packet loss = %d%%, RTA = %.2f ms",
+ name, status_str[status], (int)loss, rta);
print_perfdata(rta, loss, cd);
exit(status);
}
diff --git a/src/pinger.c b/src/pinger.c
index aef88d0..2aff036 100644
--- a/src/pinger.c
+++ b/src/pinger.c
@@ -47,10 +47,9 @@ size_t hostaddr_max;
static int ping_fd;
-static HOSTADDR *sendq_head, *sendq_tail;
static pthread_mutex_t sendq_mutex = PTHREAD_MUTEX_INITIALIZER;
-static pthread_cond_t sendq_update_cond = PTHREAD_COND_INITIALIZER;
-static pthread_cond_t sendq_empty_cond = PTHREAD_COND_INITIALIZER;
+static pthread_cond_t sendq_cond = PTHREAD_COND_INITIALIZER;
+static int send_p;
#define ICMP_HEADER_LEN (offsetof(struct icmp, icmp_data))
#define PING_HEADER_LEN (ICMP_HEADER_LEN+sizeof(struct timeval))
@@ -132,69 +131,6 @@ p903_init(void)
if (!seqidx)
emalloc_die();
}
-
-static int
-sendq_wait_empty(struct timeval *tv)
-{
- int ret = 0;
- struct timeval t1, t2;
- pthread_mutex_lock(&sendq_mutex);
- gettimeofday(&t1, NULL);
- while (sendq_head) {
- ret = 1;
- pthread_cond_wait(&sendq_empty_cond, &sendq_mutex);
- }
- gettimeofday(&t2, NULL);
- pthread_mutex_unlock(&sendq_mutex);
- if (ret)
- timersub(&t2, &t1, tv);
- return ret;
-}
-
-static void
-sendq_enqueue(HOSTADDR *addr, int delay)
-{
- struct timeval tv;
-
- pthread_mutex_lock(&sendq_mutex);
-
- gettimeofday(&tv, NULL);
- tv.tv_sec += delay;
- addr->xmit_tv = tv;
- addr->next = NULL;
- if (sendq_tail)
- sendq_tail->next = addr;
- else
- sendq_head = addr;
- sendq_tail = addr;
- pthread_cond_broadcast(&sendq_update_cond);
- pthread_mutex_unlock(&sendq_mutex);
-}
-
-static HOSTADDR *
-sendq_dequeue(void)
-{
- HOSTADDR *host;
- struct timespec ts;
-
- pthread_mutex_lock(&sendq_mutex);
- while (!sendq_head) {
- pthread_cond_wait(&sendq_update_cond, &sendq_mutex);
- }
- host = sendq_head;
- sendq_head = host->next;
- if (sendq_head == NULL)
- sendq_tail = NULL;
- host->next = NULL;
- if (!sendq_head)
- pthread_cond_broadcast(&sendq_empty_cond);
- pthread_mutex_unlock(&sendq_mutex);
-
- ts.tv_sec = host->xmit_tv.tv_sec;
- ts.tv_nsec = host->xmit_tv.tv_usec * 1000;
- clock_nanosleep(CLOCK_REALTIME, TIMER_ABSTIME, &ts, NULL);
- return host;
-}
static unsigned short
icmp_cksum(unsigned char * addr, int len)
@@ -291,7 +227,7 @@ send_echo(HOSTADDR *host, unsigned char *ping_buffer)
seqno = seqno_alloc(host, &host->xmit_tv);
if (seqno == -1) {
- sendq_enqueue(host, 0);
+ //FIXME
return;
}
@@ -301,9 +237,6 @@ send_echo(HOSTADDR *host, unsigned char *ping_buffer)
host->addr, host->addrlen);
if (n < 0) {
error("%s: sendto: %s", host->name, strerror(errno));
- if (errno == EINTR || errno == ENOBUFS) {
- sendq_enqueue(host, 0);
- }
} else {
if (host->xmit_count == 0)
host->start_tv = host->xmit_tv;
@@ -314,28 +247,54 @@ send_echo(HOSTADDR *host, unsigned char *ping_buffer)
}
}
+static void
+start_probe(void)
+{
+ pthread_mutex_lock(&sendq_mutex);
+ send_p = 1;
+ pthread_cond_broadcast(&sendq_cond);
+ pthread_mutex_unlock(&sendq_mutex);
+}
+
+static void host_stat_commit(HOSTADDR *host);
+
void *
p903_sender(void *p)
{
+ size_t i;
unsigned char *ping_buffer;
struct pollfd pfd;
int n;
+ unsigned send_count = 0;
pfd.fd = ping_fd;
pfd.events = POLLOUT;
ping_buffer = emalloc(sizeof(struct icmp) + data_length);
+ pthread_mutex_lock(&sendq_mutex);
while (1) {
- n = poll(&pfd, 1, 0);
- if (n == 1) {
- HOSTADDR *host = sendq_dequeue();
- send_echo(host, ping_buffer);
- } else if (n == -1) {
- fatal("poll: %s", strerror(errno));
- exit(1);
- }
+ if (!send_p) {
+ while (!send_p)
+ pthread_cond_wait(&sendq_cond, &sendq_mutex);
+ send_count = 0;
+ }
+ for (i = 0; i < hostaddr_count; i++) {
+ n = poll(&pfd, 1, 0);
+ if (n == 1) {
+ send_echo(&hostaddr[i], ping_buffer);
+ } else if (n == -1) {
+ fatal("poll: %s", strerror(errno));
+ exit(1);
+ }
+ }
+ send_count++;
+ if (send_count == ping_count)
+ send_p = 0;
+ else
+ sleep(ping_interval);
}
+ pthread_mutex_unlock(&sendq_mutex);
return NULL;
}
@@ -363,30 +322,50 @@ nsqrt(double a, double prec)
return x1;
}
+static inline int
+timeval_is_null(struct timeval const *tv)
+{
+ return tv->tv_sec == 0 && tv->tv_usec == 0;
+}
+
static void
-host_stat(HOSTADDR *host, struct timeval const *tv)
-{
- memcpy(&host->stat_last.start_tv, &host->start_tv, sizeof(host->stat_last.start_tv));
- memcpy(&host->stat_last.stop_tv, tv, sizeof(host->stat_last.stop_tv));
- host->stat_last.xmit_count = host->xmit_count;
- host->stat_last.recv_count = host->recv_count;
- host->stat_last.tmin = host->tmin;
- host->stat_last.tmax = host->tmax;
+hostaddr_extract_stat(HOSTADDR *host, struct host_stat *st)
+{
+ memcpy(&st->start_tv, &host->start_tv,
+ sizeof(host->stat_last.start_tv));
+ memcpy(&st->stop_tv,
+ timeval_is_null(&host->recv_tv)
+ ? &host->xmit_tv : &host->recv_tv,
+ sizeof(host->stat_last.stop_tv));
+ st->xmit_count = host->xmit_count;
+ st->recv_count = host->recv_count;
+ st->tmin = host->tmin;
+ st->tmax = host->tmax;
if (host->recv_count > 0) {
double total = host->recv_count; //FIXME: repeat count?
double avg = host->tsum / total;
double vari = host->tsumsq / total - avg * avg;
- host->stat_last.avg = avg;
- host->stat_last.stddev = nsqrt(vari, 0.0005);
+ st->avg = avg;
+ st->stddev = nsqrt(vari, 0.0005);
+ }
+}
+
+static void
+host_stat_commit(HOSTADDR *host)
+{
+ if (host->stat_last.status != HOST_STAT_VALID) {
+ hostaddr_extract_stat(host, &host->stat_last);
+ host->stat_last.status = HOST_STAT_VALID;
}
- host->stat_last.status = HOST_STAT_VALID;
}
/* Reset runtime statistics counters */
static void
host_reset(HOSTADDR *host)
{
+ if (host->xmit_count > 0)
+ host_stat_commit(host);
host->xmit_count = 0;
host->recv_count = 0;
host->tmin = 999999999.0;
@@ -417,14 +396,16 @@ int
get_host_stat(HOSTADDR *host, struct json_value **retval)
{
struct json_value *obj, *v;
- struct host_stat const *st = &host->stat_last;
- double ts;
+ struct host_stat const *st;
+ struct host_stat stbuf;
static char const *host_stat_status[] = {
"init",
"valid",
"pending",
"invalid"
};
+ int validity;
+ double ts;
if (!(obj = json_new_object()))
goto err;
@@ -432,21 +413,41 @@ get_host_stat(HOSTADDR *host, struct json_value **retval)
if (!(v = json_new_string(host->name))
|| json_object_set(obj, "name", v))
goto err;
- if (!(v = json_new_bool(host_stat_is_valid(st)))
+
+ if (host_stat_is_valid(&host->stat_last)) {
+ st = &host->stat_last;
+ validity = 1;
+ } else if (host->stat_last.status == HOST_STAT_INIT
+ && host->xmit_count > ping_tolerance) {
+ hostaddr_extract_stat(host, &stbuf);
+ stbuf.status = HOST_STAT_PENDING;
+ st = &stbuf;
+ validity = 1;
+ } else {
+ st = NULL;
+ validity = 0;
+ }
+
+ if (!(v = json_new_bool(validity))
|| json_object_set(obj, "validity", v))
goto err;
- if (!(v = json_new_string(host_stat_status[st->status]))
+
+ if (!(v = json_new_string(host_stat_status[host->stat_last.status]))
|| json_object_set(obj, "status", v))
goto err;
ts = timeval_to_double(&host->xmit_tv);
if (!(v = json_new_number(ts))
|| json_object_set(obj, "xmit-timestamp", v))
goto err;
-
- if (host_stat_is_valid(st)) {
+
+ if (st) {
int is_alive = st->xmit_count - st->recv_count
< ping_tolerance;
+ if (!(v = json_new_string(host_stat_status[st->status]))
+ || json_object_set(obj, "status", v))
+ goto err;
+
ts = timeval_to_double(&st->start_tv);
if (!(v = json_new_number(ts))
|| json_object_set(obj, "start-timestamp", v))
@@ -484,6 +485,10 @@ get_host_stat(HOSTADDR *host, struct json_value **retval)
|| json_object_set(obj, "stddev", v))
goto err;
}
+ } else {
+ if (!(v = json_new_string(host_stat_status[host->stat_last.status]))
+ || json_object_set(obj, "status", v))
+ goto err;
}
*retval = obj;
@@ -685,16 +690,16 @@ p903_receiver(void *p)
host = hostaddr_from_seqno(icmp->icmp_seq);
if (host) {
- struct timeval tv_now, tv_orig, tv_diff, *tp;
+ struct timeval tv_orig, tv_diff, *tp;
double rtt;
- gettimeofday (&tv_now, NULL);
+ gettimeofday (&host->recv_tv, NULL);
tp = (struct timeval *) icmp->icmp_data;
/* Avoid unaligned data: */
memcpy(&tv_orig, tp, sizeof (tv_orig));
- timersub(&tv_now, &tv_orig, &tv_diff);
+ timersub(&host->recv_tv, &tv_orig, &tv_diff);
rtt = timeval_to_double(&tv_diff) * 1000.0;
host->tsum += rtt;
@@ -708,12 +713,8 @@ p903_receiver(void *p)
if (verbose)
log_echo((struct sockaddr *)&addr, addrlen, icmp, ip, n, rtt);
-
- if (host->recv_count < ping_count) {
- sendq_enqueue(host, ping_interval);
- } else {
- host_stat(host, &tv_now);
- }
+ if (host->recv_count == ping_count)
+ host_stat_commit(host);
}
}
}
@@ -723,39 +724,12 @@ p903_receiver(void *p)
void *
p903_scheduler(void *p)
{
- int update_round = 0;
- double d;
-
while (1) {
size_t i;
- struct timeval tv;
-
/* Reset all statistics */
for (i = 0; i < hostaddr_count; i++)
host_reset(hostaddr + i);
-
- /* Wait for pending probes to finish */
- if (sendq_wait_empty(&tv)) {
- if (update_round == 0) {
- info("scheduler: "
- "starting probe interval correction");
- d = 0;
- }
- d += timeval_to_double(&tv);
- update_round++;
- info("scheduler: average: %.3fs", d / update_round);
- if (update_round == UPDATE_MAX) {
- d /= update_round;
- probe_interval += 3*d/2;
- info("scheduler: "
- "finished probe interval correction: %lu",
- probe_interval);
- update_round = 0;
- }
- }
-
- for (i = 0; i < hostaddr_count; i++)
- sendq_enqueue(hostaddr + i, 0);
+ start_probe();
sleep(probe_interval);
}
}

Return to:

Send suggestions and report system problems to the System administrator.