Database systems use static analysis to determine upfront which data is needed to answer a query, and use indexes and other physical design techniques to speed up access to that data. However, for important classes of queries, e.g., HAVING and top-k queries, it is impossible to determine upfront what data is relevant. To overcome this limitation, we develop provenance-based data skipping (PBDS), a novel approach that generates provenance sketches to concisely encode what data is relevant for a query. Once a provenance sketch has been captured, it is used to speed up subsequent queries. PBDS can exploit physical design artifacts such as indexes and zone maps.
@article{NL21,
  author           = {Niu, Xing and Liu, Ziyu and Li, Pengyuan and Glavic, Boris and Gawlick, Dieter and Krishnaswamy, Vasudha and Liu, Zhen Hua and Porobic, Danica},
  title            = {Provenance-based Data Skipping},
  journal          = {Proceedings of the VLDB Endowment},
  volume           = {15},
  number           = {3},
  issue            = {3},
  pages            = {451--464},
  year             = {2021},
  doi              = {10.14778/3494124.3494130},
  keywords         = {Provenance, Data Skipping, Relevance-based Data Management},
  projects         = {Relevance-based Data Management},
  venueshort       = {{PVLDB}},
  pdfurl           = {https://vldb.org/pvldb/vol15/p451-niu.pdf},
  longversionurl   = {https://arxiv.org/pdf/2104.12815},
  internal-note    = {number mirrors issue: standard BibTeX styles read number; issue is kept for any tooling that already reads it}
}