% Dumitras and Narasimhan, Computer Networks 57 (2013), pp. 682--698.
% Empirical study of latency unpredictability in fault-tolerant middleware.
% Source record carried the full date 2013/02/26; only the month is kept in
% the `month` field because classic BibTeX has no day field.
@article{19485,
  author   = {Dumitras, Tudor and Narasimhan, Priya},
  title    = {A study of unpredictability in fault-tolerant middleware},
  journal  = {Computer Networks},
  volume   = {57},
  year     = {2013},
  month    = feb,
  pages    = {682--698},
  issn     = {1389-1286},
  url      = {http://www.sciencedirect.com/science/article/pii/S1389128612003696},
  keywords = {Fault tolerance, latency, Middleware, Remote procedure call, Unpredictability},
  abstract = {In enterprise applications relying on fault-tolerant middleware, it is a common engineering practice to establish service-level agreements (SLAs) based on the 95th or the 99th percentiles of the latency, to allow a margin for unexpected variability. However, the extent of this unpredictability has not been studied systematically. We present an extensive empirical study of unpredictability in 16 distributed systems, ranging from simple transport protocols to fault-tolerant, middleware-based enterprise applications, and we show that the inherent unpredictability in the systems examined arises from at most 1\% of the remote invocations. In the normal, fault-free operating mode most remote invocations have a predictable end-to-end latency, but the maximum latency follows unpredictable trends and is comparable with the time needed to recover from a fault. The maximum latency is not influenced by the system's workload, cannot be regulated through configuration parameters and is not correlated with the system's resource consumption. The high-latency outliers (up to three orders of magnitude higher than the average latency) have multiple causes and may originate in any component of the system. However, after filtering out 1\% of the invocations with the highest recorded response-times, the latency becomes bounded with high statistical confidence ($p < 0.01$). We have verified this result on different operating systems (Linux 2.4, Linux 2.6, Linux-rt, TimeSys), middleware platforms (CORBA and EJB), programming languages (C, C++ and Java), replication styles (active and warm passive) and applications (e-commerce and online gaming).
Moreover, this phenomenon occurs at all the layers of middleware-based systems, from the communication protocols to the business logic.},
}