Managing fine-grained provenance is a critical requirement for data stream management systems (DSMS), not only to address complex applications that require diagnostic capabilities and assurance, but also to provide advanced functionality such as revision processing or query debugging. This paper introduces a novel approach that uses operator instrumentation, i.e., modifying the behavior of operators, to generate and propagate fine-grained provenance through several operators of a query network. In addition to applying this technique to compute provenance eagerly during query execution, we also study how to decouple provenance computation from query processing to reduce run-time overhead and avoid unnecessary provenance retrieval. Our proposals include computing a concise superset of the provenance (to allow lazily replaying a query and reconstructing its provenance) as well as lazy retrieval (to avoid unnecessary reconstruction of provenance). We develop stream-specific compression methods to reduce the computational and storage overhead of provenance generation and retrieval. Ariadne, our provenance-aware extension of the Borealis DSMS, implements these techniques. Our experiments confirm that Ariadne manages provenance with minor overhead and clearly outperforms query rewrite, the current state of the art.
% GE14: journal article entry (venueshort TOIT, volume 13, number 1, 2014), tagged with project Ariadne.
% The page range is written with BibTeX's `--` range separator so styles typeset an en-dash.
@article{GE14,
  author        = {Glavic, Boris and Esmaili, Kyumars Sheykh and Fischer, Peter M. and Tatbul, Nesime},
  title         = {Efficient Stream Provenance via Operator Instrumentation},
  journal       = {Transactions on Internet Technology},
  venueshort    = {TOIT},
  volume        = {13},
  number        = {1},
  pages         = {7:1--7:26},
  year          = {2014},
  keywords      = {Ariadne; Provenance},
  projects      = {Ariadne},
  pdfurl        = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GE14.pdf},
  date-added    = {2014-05-11 17:49:19 +0000},
  date-modified = {2014-05-11 17:55:40 +0000},
  bdsk-url-1    = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GE14.pdf}
}