@inproceedings{10.1145/3431379.3460640,
author = {Rajesh, Neeraj and Devarajan, Hariharan and Garcia, Jaime Cernuda and Bateman, Keith and Logan, Luke and Ye, Jie and Kougkas, Anthony and Sun, Xian-He},
title = {Apollo: An ML-Assisted Real-Time Storage Resource Observer},
year = {2020},
isbn = {9781450382175},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3431379.3460640},
doi = {10.1145/3431379.3460640},
abstract = {Applications and middleware services, such as data placement engines, I/O scheduling, and prefetching engines, require low-latency access to telemetry data in order to make optimal decisions. However, typical monitoring services store their telemetry data in a database in order to allow applications to query them, resulting in significant latency penalties. This work presents Apollo: a low-latency monitoring service that aims to provide applications and middleware libraries with direct access to relational telemetry data. Monitoring the system can create interference and overhead, slowing down raw performance of the resources for the job. However, having a current view of the system can aid middleware services in making more optimal decisions which can ultimately improve the overall performance. Apollo has been designed from the ground up to provide low latency, using Publish-Subscriber Pub-Sub semantics, and low overhead, using adaptive intervals in order to change the length of time between polling the resource for telemetry data and machine learning in order to predict changes to the telemetry data between actual resource polling. This work also provides some high level abstractions called I/O curators, which can further aid middleware libraries and applications to make optimal decisions. Evaluations showcase that Apollo can achieve sub-millisecond latency for acquiring complex insights with a memory overhead of ~57 MB and CPU overhead being only 7% more than existing state-of-the-art systems.},
booktitle = {Proceedings of the 30th International Symposium on High-Performance Parallel and Distributed Computing},
pages = {147–159},
numpages = {13},
keywords = {hpc cluster monitoring, resource monitoring, storage utilization, real-time monitoring, low latency monitoring, storage monitoring},
location = {Virtual Event, Sweden},
series = {HPDC '21}
}