Extending sFlow's counter polling to export system performance metrics got me thinking about ways to extend sampling into servers. This thinking was in part motivated by the following problem.
Consider a large data center comprising a compute cluster and a large NAS array (NFS/CIFS). If all the servers in the cluster start accessing the same file on the NAS array, that file can become a bottleneck, dramatically reducing throughput. The same kind of problem can occur in a variety of data center services: HTTP, DNS, memcached, etc.
Extending sFlow's sampling mechanism to application-layer transactions provides a scalable way to monitor these types of distributed services. An sFlow agent embedded in the application would randomly sample completed transactions. This has minimal impact on application performance, adding only a counter decrement and test to the critical path. When a sample is taken, key transaction metrics are captured, e.g. service_direction (client/server), type (e.g. read, write, get, head, ...), status (succeeded, failed, error code), path (file path, URL, LUN), bytes_in, bytes_out, and duration. In addition to the transaction statistics, the socket information associated with the request (remote IP, remote port, local IP, local port) is captured and exported along with the transaction metrics. The socket information lets you link the application view to the network topology and traffic data that sFlow in the switches provides.
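To make the cost of the fast path concrete, here is a minimal C sketch of the countdown test; all names (sflow_skip, next_skip, sflow_sample_test) are illustrative, not part of any existing sFlow API, and a production agent would use a better random source than rand().

#include <stdlib.h>

static unsigned int sflow_skip = 0;          /* transactions until next sample */
static unsigned int sflow_mean_rate = 1000;  /* e.g. 1-in-1000 sampling */

/* Choose the next skip count at random so that sampling is unbiased. */
static unsigned int next_skip(void)
{
    return (unsigned int)(rand() % (2 * sflow_mean_rate));
}

/* Called once per completed transaction; returns 1 if this
   transaction should be captured and exported. */
static int sflow_sample_test(void)
{
    if (sflow_skip == 0) {
        sflow_skip = next_skip();
        return 1;
    }
    sflow_skip--;
    return 0;
}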
This instrumentation could be applied at either end of a transaction, or at both. Returning to the NAS example, implementing sFlow transaction monitoring in the NFS/CIFS storage array would provide the information needed to identify hotspots in real time. For large web farms it would be easy to implement transaction sampling in the logging module of the web servers (for example, an Apache logging module that exports sampled web requests as sFlow; a sketch follows the extended_http_transaction structure below). Layer 4-7 switches, proxies, and caches could also sample and export layer 7 transactions.
The following sFlow structures are a first cut at defining layer 7 samples:
/* Application transaction sampling */
/* Note: Transactions are sampled upon completion */
enum status_value {
succeeded = 0,
generic_failure = 1,
outofmemory = 2,
timeout = 3,
notpermitted = 4
}
enum service_direction {
client = 1,
server = 2
}
/* Generic Application Transaction record */
/* Every Application Transaction sample must start with a generic transaction record */
/* opaque = flow_data; enterprise = 0; format = 2000 */
struct transaction {
service_direction direction; /* was this transaction observed by the server or the client */
unsigned int wait; /* time in microseconds that transaction was queued
before processing started */
unsigned int duration; /* time in microseconds from start of processing to completion */
status_value status; /* status of transaction */
unsigned hyper bytes_received; /* bytes received */
unsigned hyper bytes_sent; /* bytes sent */
}
/* Extended socket information,
Must be filled in for all transactions associated with a network socket
Omit if transaction associated with non-network IPC */
/* IPv4 Socket */
/* opaque = flow_data; enterprise = 0; format = 2100 */
struct extended_socket_ipv4 {
unsigned int protocol; /* IP Protocol type
(for example, TCP = 6, UDP = 17) */
ip_v4 local_ip; /* local IP address */
ip_v4 remote_ip; /* remote IP address */
unsigned int local_port; /* TCP/UDP local port number or equivalent */
unsigned int remote_port; /* TCP/UDP remote port number or equivalent */
}
/* IPv6 Socket */
/* opaque = flow_data; enterprise = 0; format = 2101 */
struct extended_socket_ipv6 {
unsigned int protocol; /* IP Protocol type
(for example, TCP = 6, UDP = 17) */
ip_v6 local_ip; /* local IP address */
ip_v6 remote_ip; /* remote IP address */
unsigned int local_port; /* TCP/UDP local port number or equivalent */
unsigned int remote_port; /* TCP/UDP remote port number or equivalent */
}
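On POSIX systems the socket 4-tuple for these records can be read directly from the transaction's file descriptor. A sketch for the IPv4 case, assuming a connected socket (the struct and helper names are illustrative):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

/* Mirrors the proposed extended_socket_ipv4 record */
struct socket_info_v4 {
    unsigned int protocol;               /* e.g. IPPROTO_TCP = 6 */
    struct in_addr local_ip, remote_ip;
    unsigned int local_port, remote_port;
};

/* Capture the 4-tuple of a connected IPv4 socket from its descriptor. */
int capture_socket_ipv4(int fd, unsigned int protocol,
                        struct socket_info_v4 *s)
{
    struct sockaddr_in local, remote;
    socklen_t len = sizeof(local);
    if (getsockname(fd, (struct sockaddr *)&local, &len) < 0)
        return -1;
    len = sizeof(remote);
    if (getpeername(fd, (struct sockaddr *)&remote, &len) < 0)
        return -1;
    s->protocol    = protocol;
    s->local_ip    = local.sin_addr;
    s->remote_ip   = remote.sin_addr;
    s->local_port  = ntohs(local.sin_port);
    s->remote_port = ntohs(remote.sin_port);
    return 0;
}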
/* Extended NFS transaction */
/* see RFC 3530 */
/* opaque = flow_data; enterprise = 0; format = 2001 */
struct extended_nfs_storage_transaction {
opaque<> path; /* canonical path to file or directory
associated with operation file handle
UTF8 encoded string */
unsigned int operation; /* NFS operation */
unsigned int status; /* NFS operation status - nfsstat4 */
}
/* Extended SCSI transaction */
/* opaque = flow_data; enterprise = 0; format = 2002 */
struct extended_scsi_storage_transaction {
unsigned int lun; /* LUN */
unsigned int operation; /* SCSI operation; use maxint to encode an unknown operation */
unsigned int status; /* SCSI status code reporting result of operation */
}
/* Extended Web transaction */
/* opaque = flow_data; enterprise = 0; format = 2003 */
struct extended_http_transaction {
string<> url; /* The HTTP request-line (see RFC 2616) */
string<> host; /* The host field from the HTTP header */
string<> referer; /* The referer field from the HTTP header */
string<> useragent; /* The user agent from the HTTP header */
string<> user; /* The authenticated user */
unsigned int status; /* Status code returned with response */
}
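As suggested earlier, this record maps naturally onto a web server's logging phase. The following is a rough sketch of an Apache 2.2 logging hook: sflow_export_http_sample is a hypothetical exporter (assumed to tolerate NULL fields), the fixed 1-in-1000 interval stands in for a randomized skip count, and a production module would also need to make the counter thread-safe under threaded MPMs.

#include "httpd.h"
#include "http_config.h"
#include "http_protocol.h"

/* Hypothetical exporter, not part of Apache. */
extern void sflow_export_http_sample(const char *request_line,
                                     const char *host,
                                     const char *referer,
                                     const char *useragent,
                                     const char *user,
                                     int status);

static unsigned int countdown = 1;   /* requests until the next sample */

/* Runs once per completed request, in the logging phase. */
static int sflow_log_transaction(request_rec *r)
{
    if (--countdown == 0) {
        countdown = 1000;            /* fixed 1-in-1000 for the sketch */
        sflow_export_http_sample(r->the_request,
                                 r->hostname,
                                 apr_table_get(r->headers_in, "Referer"),
                                 apr_table_get(r->headers_in, "User-Agent"),
                                 r->user,
                                 r->status);
    }
    return OK;
}

static void sflow_register_hooks(apr_pool_t *p)
{
    ap_hook_log_transaction(sflow_log_transaction, NULL, NULL,
                            APR_HOOK_MIDDLE);
}

module AP_MODULE_DECLARE_DATA sflow_log_module = {
    STANDARD20_MODULE_STUFF,
    NULL, NULL, NULL, NULL, NULL,    /* no configuration handling */
    sflow_register_hooks
};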
There has been some discussion about sharing metrics with Ganglia on the ganglia-developers mailing list:
http://www.mail-archive.com/ganglia-developers@lists.sourceforge.net/
The following is a first attempt at defining host performance metrics for sFlow:
/* Physical Server CPU */
/* opaque = counter_data; enterprise = 0; format = 2003 */
struct host_cpu {
float load_one; /* 1 minute load avg. */
float load_five; /* 5 minute load avg. */
float load_fifteen; /* 15 minute load avg. */
unsigned int proc_run; /* total number of running processes */
unsigned int proc_total; /* total number of processes */
unsigned int cpu_num; /* number of CPUs */
unsigned int cpu_speed; /* speed in MHz of CPU */
unsigned int uptime; /* seconds since last reboot */
unsigned int cpu_user; /* user time (ms) */
unsigned int cpu_nice; /* nice time (ms) */
unsigned int cpu_system; /* system time (ms) */
unsigned int cpu_idle; /* idle time (ms) */
unsigned int cpu_wio; /* time waiting for I/O to complete (ms) */
unsigned int cpu_intr; /* time servicing interrupts (ms) */
unsigned int cpu_sintr; /* time servicing soft interrupts (ms) */
unsigned int interrupts; /* interrupt count */
unsigned int context; /* context switch count */
}
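On Linux, most of these counters can be read from /proc and sysconf(). A sketch that fills the load, process, and CPU-count fields (struct and function names are illustrative):

#include <stdio.h>
#include <unistd.h>

/* Mirrors part of the proposed host_cpu record */
struct host_cpu_sample {
    float load_one, load_five, load_fifteen;
    unsigned int proc_run, proc_total;
    unsigned int cpu_num;
};

/* Fill load averages and process counts from /proc/loadavg;
   Linux-specific. The line looks like "0.12 0.08 0.05 1/123 4567". */
int read_host_cpu(struct host_cpu_sample *c)
{
    FILE *f = fopen("/proc/loadavg", "r");
    if (!f) return -1;
    int n = fscanf(f, "%f %f %f %u/%u",
                   &c->load_one, &c->load_five, &c->load_fifteen,
                   &c->proc_run, &c->proc_total);
    fclose(f);
    if (n != 5) return -1;
    /* _SC_NPROCESSORS_ONLN is a common extension, not strict POSIX */
    c->cpu_num = (unsigned int)sysconf(_SC_NPROCESSORS_ONLN);
    return 0;
}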
/* Physical Server Memory */
/* opaque = counter_data; enterprise = 0; format = 2004 */
struct host_memory {
unsigned int mem_total; /* total kB */
unsigned int mem_free; /* free kB */
unsigned int mem_shared; /* shared kB */
unsigned int mem_buffers; /* buffers kB */
unsigned int mem_cached; /* cached kB */
unsigned int swap_total; /* swap total kB */
unsigned int swap_free; /* swap free kB */
unsigned int page_in; /* page in count */
unsigned int page_out; /* page out count */
unsigned int swap_in; /* swap in count */
unsigned int swap_out; /* swap out count */
}
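The memory counters map directly onto /proc/meminfo on Linux. A sketch with illustrative names (mem_shared is left out, since recent kernels no longer report a meaningful value for it):

#include <stdio.h>
#include <string.h>

/* Mirrors part of the proposed host_memory record (values in kB) */
struct host_memory_sample {
    unsigned int mem_total, mem_free, mem_buffers, mem_cached;
    unsigned int swap_total, swap_free;
};

/* Fill memory counters from /proc/meminfo; Linux-specific. */
int read_host_memory(struct host_memory_sample *m)
{
    char line[128], key[64];
    unsigned int kb;
    FILE *f = fopen("/proc/meminfo", "r");
    if (!f) return -1;
    memset(m, 0, sizeof(*m));
    /* lines look like "MemTotal:      2048000 kB" */
    while (fgets(line, sizeof(line), f)) {
        if (sscanf(line, "%63s %u", key, &kb) != 2) continue;
        if      (strcmp(key, "MemTotal:") == 0)  m->mem_total   = kb;
        else if (strcmp(key, "MemFree:") == 0)   m->mem_free    = kb;
        else if (strcmp(key, "Buffers:") == 0)   m->mem_buffers = kb;
        else if (strcmp(key, "Cached:") == 0)    m->mem_cached  = kb;
        else if (strcmp(key, "SwapTotal:") == 0) m->swap_total  = kb;
        else if (strcmp(key, "SwapFree:") == 0)  m->swap_free   = kb;
    }
    fclose(f);
    return 0;
}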
/* Physical Server Disk I/O */
/* opaque = counter_data; enterprise = 0; format = 2005 */
struct host_disk_io {
unsigned int reads; /* reads issued */
unsigned int reads_merged; /* reads merged */
unsigned int sectors_read; /* sectors read */
unsigned int read_time; /* read time (ms) */
unsigned int writes; /* writes completed */
unsigned int writes_merged; /* writes merged */
unsigned int sectors_written; /* sectors written */
unsigned int write_time; /* write time (ms) */
}
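On Linux, these disk counters correspond one-to-one to fields 4 through 11 of each line in /proc/diskstats. A sketch that sums them across devices (illustrative names; a real agent would skip partition lines to avoid double counting):

#include <stdio.h>
#include <string.h>

/* Mirrors the proposed host_disk_io record */
struct host_disk_io_sample {
    unsigned int reads, reads_merged, sectors_read, read_time;
    unsigned int writes, writes_merged, sectors_written, write_time;
};

/* Sum per-device counters from /proc/diskstats; Linux-specific. */
int read_host_disk_io(struct host_disk_io_sample *d)
{
    char dev[32], line[256];
    FILE *f = fopen("/proc/diskstats", "r");
    if (!f) return -1;
    memset(d, 0, sizeof(*d));
    while (fgets(line, sizeof(line), f)) {
        unsigned int major, minor;
        unsigned long r, rm, rs, rt, w, wm, ws, wt;
        if (sscanf(line, "%u %u %31s %lu %lu %lu %lu %lu %lu %lu %lu",
                   &major, &minor, dev,
                   &r, &rm, &rs, &rt, &w, &wm, &ws, &wt) == 11) {
            d->reads  += r;  d->reads_merged  += rm;
            d->sectors_read += rs;  d->read_time += rt;
            d->writes += w;  d->writes_merged += wm;
            d->sectors_written += ws;  d->write_time += wt;
        }
    }
    fclose(f);
    return 0;
}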
/* Physical Server Network I/O */
/* opaque = counter_data; enterprise = 0; format = 2006 */
struct host_net_io {
unsigned hyper bytes_in; /* total bytes in */
unsigned int pkts_in; /* total packets in */
unsigned int errs_in; /* total errors in */
unsigned int drops_in; /* total drops in */
unsigned hyper bytes_out; /* total bytes out */
unsigned int pkts_out; /* total packets out */
unsigned int errs_out; /* total errors out */
unsigned int drops_out; /* total drops out */
}
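Similarly, the network counters can be summed across interfaces from /proc/net/dev. A sketch with illustrative names (a real agent would probably exclude the loopback interface):

#include <stdio.h>
#include <string.h>

/* Mirrors the proposed host_net_io record */
struct host_net_io_sample {
    unsigned long long bytes_in, bytes_out;
    unsigned int pkts_in, errs_in, drops_in;
    unsigned int pkts_out, errs_out, drops_out;
};

/* Sum counters over all interfaces in /proc/net/dev; Linux-specific. */
int read_host_net_io(struct host_net_io_sample *n)
{
    char line[512];
    FILE *f = fopen("/proc/net/dev", "r");
    if (!f) return -1;
    memset(n, 0, sizeof(*n));
    fgets(line, sizeof(line), f);   /* skip the two header lines */
    fgets(line, sizeof(line), f);
    while (fgets(line, sizeof(line), f)) {
        unsigned long long rb, rp, re, rd, tb, tp, te, td, skip;
        char *p = strchr(line, ':');
        if (!p) continue;
        if (sscanf(p + 1,
                   "%llu %llu %llu %llu %llu %llu %llu %llu"
                   " %llu %llu %llu %llu",
                   &rb, &rp, &re, &rd, &skip, &skip, &skip, &skip,
                   &tb, &tp, &te, &td) == 12) {
            n->bytes_in  += rb;  n->pkts_in   += rp;
            n->errs_in   += re;  n->drops_in  += rd;
            n->bytes_out += tb;  n->pkts_out  += tp;
            n->errs_out  += te;  n->drops_out += td;
        }
    }
    fclose(f);
    return 0;
}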