2024
-
Towards an Objective Metric for Data Value Through Relevance
Proceedings of the 14th Conference on Innovative Data Systems (2024).
2023
-
Efficient Approximation of Certain and Possible Answers for Ranking and Window Queries over Uncertain Data
Su Feng, Boris Glavic and Oliver Kennedy
Proceedings of the VLDB Endowment. 16, 6 (2023), 1346–1358.
-
How-to-PhD - A Guide for Dummies
Boris Glavic
EDBT Ph.D. Workshop Keynote.
-
Provenance, Relevance-based Data Management, and the Value of Data
Boris Glavic
Provenance Week Keynote.
-
Overlay Spreadsheets
Oliver Kennedy, Boris Glavic and Michael Brachmann
Proceedings of the Workshop on Human-In-the-Loop Data Analytics, HILDA 2023, Seattle, WA, USA, 18 June 2023 (2023), pp. 4:1–4:7.
-
Hybrid Query and Instance Explanations and Repairs
Seokki Lee, Boris Glavic, Adriane Chapman and Bertram Ludäscher
Companion Proceedings of the ACM Web Conference 2023, WWW 2023, Austin, TX, USA, 30 April 2023 – 4 May 2023 (2023), pp. 1559–1562.
-
Efficient Management of Uncertain Data
Su Feng
PhD thesis, Illinois Institute of Technology.
-
Self-tuning Database Operations by Assessing the Importance of Data
Boris Glavic, Pengyuan Li and Ziyu Liu
Technical Report #IIT/CS-DB-2023-01
Illinois Institute of Technology.
2022
-
Effect of Pre-processing Data on Fairness and Fairness Debugging Using Gopher
Mousam Sarkar
Master's thesis, Illinois Institute of Technology.
-
Generating Interpretable Data-Based Explanations for Fairness Debugging using Gopher
Jiongli Zhu, Romila Pradhan, Boris Glavic and Babak Salimi
Proceedings of the 48th International Conference on Management of Data (SIGMOD) (Demonstration Track) (2022), pp. 2433–2436.
-
Efficient Answering of Historical What-if Queries
Felix Campbell, Bahareh Arab and Boris Glavic
Proceedings of the 48th International Conference on Management of Data (SIGMOD) (2022), pp. 1556–1569.
-
Runtime Provenance Refinement for Notebooks
Nachiket Deo, Boris Glavic and Oliver Kennedy
Proceedings of the 14th International Workshop on the Theory and Practice of Provenance (2022), pp. 8:1–8:4.
-
CaJaDE: Explaining Query Results by Augmenting Provenance with Context
Chenjie Li, Juseung Lee, Zhengjie Miao, Boris Glavic and Sudeepa Roy
Proceedings of the VLDB Endowment (Demonstration Track). 15, 12 (2022), 3594–3597.
-
Oracle PBDS Experiments
Boris Glavic, Xing Niu, Pengyuan Li and Ziyu Liu
Technical Report #IIT/Cs-db-2022-01
Illinois Institute of Technology.
-
The Right Tool for the Job: Data-Centric Workflows in Vizier
Oliver Kennedy, Boris Glavic, Juliana Freire and Mike Brachmann
IEEE Data Eng. Bull. 45, 3 (2022), 129–144.
-
Enhancing Explanation Generation in the CaJaDE system using Interactive User Feedback
Juseung Lee
Master's thesis, Illinois Institute of Technology.
-
Interpretable Data-Based Explanations for Fairness Debugging
Babak Salimi, Romila Pradhan, Jiongli Zhu and Boris Glavic
Proceedings of the 48th International Conference on Management of Data (2022), pp. 247–261.
2021
-
Integrating Provenance Management and Query Optimization
Xing Niu
PhD thesis, Illinois Institute of Technology.
-
Efficient Uncertainty Tracking for Complex Queries with Attribute-level Bounds
Su Feng, Aaron Huber, Boris Glavic and Oliver Kennedy
Proceedings of the 46th International Conference on Management of Data (2021), pp. 528–540.
Incomplete and probabilistic database techniques are principled methods for coping with uncertainty in data. Unfortunately, the class of queries that can be answered efficiently over such databases is severely limited, even when advanced approximation techniques are employed. We introduce attribute-annotated uncertain databases (AU-DBs), an uncertain data model that annotates tuples and attribute values with bounds to compactly approximate an incomplete database. AU-DBs are closed under relational algebra with aggregation using an efficient evaluation semantics. Using optimizations that trade accuracy for performance, our approach scales to complex queries and large datasets, and produces accurate results.
-
To not miss the forest for the trees - A holistic approach for explaining missing answers over nested data
Ralf Diestelkämper, Seokki Lee, Melanie Herschel and Boris Glavic
Proceedings of the 46th International Conference on Management of Data (2021), pp. 405–417.
Query-based explanations for missing answers identify which operators of a query are responsible for the failure to return a missing answer of interest. This type of explanation has proven useful, e.g., to debug complex analytical queries. Such queries are frequent in big data systems such as Apache Spark. We present a novel approach to produce query-based explanations. It is the first to support nested data and to consider operators that modify the schema and structure of the data (e.g., nesting, projections) as potential causes of missing answers. To efficiently compute explanations, we propose a heuristic algorithm that applies two novel techniques: (i) reasoning about multiple schema alternatives for a query and (ii) re-validating at each step whether an intermediate result can contribute to the missing answer. Using an implementation on Spark, we demonstrate that our approach is the first to scale to large datasets while often finding explanations that existing techniques fail to identify.
-
Putting Things into Context: Rich Explanations for Query Answers using Join Graphs
Chenjie Li, Zhengjie Miao, Qitian Zeng, Boris Glavic and Sudeepa Roy
Proceedings of the 46th International Conference on Management of Data (2021), pp. 1051–1063.
In many data analysis applications there is a need to explain why a surprising or interesting result was produced by a query. Previous approaches to explaining results have directly or indirectly relied on data provenance, i.e., input tuples contributing to the result(s) of interest. However, some information that is relevant for explaining an answer may not be contained in the provenance. We propose a new approach for explaining query results by augmenting provenance with information from other related tables in the database. Using a suite of optimization techniques, we demonstrate experimentally using real datasets and through a user study that our approach produces meaningful results and is efficient.
-
Data Provenance - Origins, Applications, Algorithms, and Models
Boris Glavic
Foundations and Trends® in Databases. 9, 3–4 (2021), 209–441.
Data provenance has evolved from a niche topic to a mainstream area of research in databases and other research communities. This article gives a comprehensive introduction to data provenance. The main focus is on provenance in the context of databases. However, it will be insightful to also consider connections to related research in programming languages, software engineering, semantic web, formal logic, and other communities. The target audience is researchers and practitioners who want to gain a solid understanding of data provenance and the state of the art in this research area. The article only assumes that the reader has a basic understanding of database concepts, but not necessarily any prior exposure to provenance.
-
Trends in Explanations: Understanding and Debugging Data-driven Systems
Boris Glavic, Alexandra Meliou and Sudeepa Roy
Foundations and Trends® in Databases. 11, 3 (2021), 226–318.
Humans reason about the world around them by seeking to understand why and how something occurs. The same principle extends to the technology that so many human activities increasingly rely on. Issues of trust, transparency, and understandability are critical in promoting adoption and proper use of systems. However, with the increasing complexity of the systems and technologies we use, it is hard or even impossible to comprehend their function and behavior, and to justify surprising observations through manual investigation alone. Explanation support can ease humans’ interactions with technology: explanations can help users understand a system’s function, justify system results, and increase their trust in automated decisions. Our goal in this article is to provide an overview of existing work in explanation support for data-driven processes, through a lens that identifies commonalities across varied problem settings and solutions. We suggest a classification of explainability requirements across three dimensions: the target of the explanation (“What”), the audience of the explanation (“Who”), and the purpose of the explanation (“Why”). We identify dominant themes across these dimensions and the high-level desiderata each implies, accompanied by several examples to motivate various problem settings. We discuss explainability solutions through the lens of the “How” dimension: How something is explained (the form of the explanation) and how explanations are derived (methodology). We conclude with a roadmap of possible research directions for the data management community within the field of explainability in data systems.
-
Debugging Missing Answers for Spark Queries over Nested Data with Breadcrumb
Ralf Diestelkämper, Seokki Lee, Melanie Herschel and Boris Glavic
Proceedings of the VLDB Endowment (Demonstration Track). 14, 12 (2021), 2731–2734.
-
Playing Fetch with CAT - Composing Cache Partitioning and Prefetching for Task-based Query Processing
Qitian Zeng, Kyle Hale and Boris Glavic
International Workshop on Data Management on New Hardware (2021).
-
Provenance and Annotation of Data and Processes - 8th and 9th International Provenance and Annotation Workshop, IPAW 2020 + IPAW 2021, Virtual Event, July 19-22, 2021, Proceedings
Boris Glavic, Vanessa Braganholo and David Koop, editors
Lecture Notes in Computer Science, Vol. 12839. Springer.
-
Provenance-based Data Skipping
Xing Niu, Ziyu Liu, Pengyuan Li, Boris Glavic, Dieter Gawlick, Vasudha Krishnaswamy, Zhen Hua Liu and Danica Porobic
Proceedings of the VLDB Endowment. 15, 3 (2021), 451–464.
Database systems use static analysis to determine upfront which data is needed for answering a query and use indexes and other physical design techniques to speed up access to that data. However, for important classes of queries, e.g., HAVING and top-k queries, it is impossible to determine upfront what data is relevant. To overcome this limitation, we develop provenance-based data skipping (PBDS), a novel approach that generates provenance sketches to concisely encode what data is relevant for a query. Once a provenance sketch has been captured, it is used to speed up subsequent queries. PBDS can exploit physical design artifacts such as indexes and zone maps.
-
DataSense: Display Agnostic Data Documentation
Poonam Kumari, Michael Brachmann, Oliver Kennedy, Su Feng and Boris Glavic
Proceedings of the 11th Conference on Innovative Data Systems (2021).
2020
-
Your notebook is not crumby enough, REPLace it
Michael Brachmann, William Spoth, Oliver Kennedy, Boris Glavic, Heiko Müller, Sonia Castelo, Carlos Bautista and Juliana Freire
Proceedings of the 10th Conference on Innovative Data Systems (2020).
Notebook and spreadsheet systems are currently the de facto standard for data collection, preparation, and analysis. However, these systems have been criticized for their lack of reproducibility, versioning, and support for sharing. These shortcomings are particularly detrimental for data curation where data scientists iteratively build workflows to clean up and integrate data as a prerequisite for analysis. We present Vizier, an open-source tool that helps analysts to build and refine data pipelines. Vizier combines the flexibility of notebooks with the easy-to-use data manipulation interface of spreadsheets. Combined with advanced provenance tracking for both data and computational steps, this enables reproducibility, versioning, and streamlined data exploration. Unique to Vizier is that it exposes potential issues with data, no matter whether they already exist in the input or are introduced by the operations of a notebook. We refer to such potential errors as data caveats. Caveats are propagated alongside data using principled techniques from uncertain data management. Vizier provides extensive user interface support for caveats, e.g., exposing them as summaries in a dedicated error view and highlighting cells with caveats in spreadsheets.
-
Approximate Summaries for Why and Why-not Provenance
Seokki Lee, Bertram Ludäscher and Boris Glavic
Proceedings of the VLDB Endowment. 13, 6 (2020), 912–924.
Why and why-not provenance have been studied extensively in recent years. However, why-not provenance and — to a lesser degree — why provenance can be very large, resulting in severe scalability and usability challenges. We introduce a novel approximate summarization technique for provenance to address these challenges. Our approach uses patterns to encode why and why-not provenance concisely. We develop techniques for efficiently computing provenance summaries that balance informativeness, conciseness, and completeness. To achieve scalability, we integrate sampling techniques into provenance capture and summarization. Our approach is the first to both scale to large datasets and to generate comprehensive and meaningful summaries.
-
Why and Why-Not Provenance for Queries with Negation
Seokki Lee
PhD thesis, Illinois Institute of Technology.
2019
-
Provenance For Transactional Updates
Bahareh Arab
PhD thesis, Illinois Institute of Technology.
Database provenance explains how results are derived by queries. However, many use cases such as auditing and debugging of transactions require understanding of how the current state of a database was derived by a transactional history. We introduce an approach for capturing the provenance of transactions. Our approach does not just work for serializable transactions but also for non-serializable transactions such as read committed snapshot isolation (RC-SI). The main drivers of our approach are a provenance model for queries, updates, and transactions and reenactment, a novel technique for retroactively capturing the provenance of tuple versions. We introduce the MV-semirings provenance model for updates and transactions as an extension of the existing semiring provenance model for queries. Our reenactment technique exploits the time travel and audit logging capabilities of modern DBMS to replay parts of a transactional history using queries. Importantly, our technique requires no changes to the transactional workload or underlying DBMS and results in only moderate runtime overhead for transactions. Furthermore, we discuss how our MV-semirings model and reenactment approach can be used to serve a wide variety of applications and use cases including answering of historical what-if queries which determine the effect of hypothetical changes to past operations of a business, post-mortem debugging of transactions, and Provenance-aware Versioned Dataworkspaces (PVDs). We have implemented our approach on top of a commercial DBMS and our experiments confirm that by applying novel optimizations we can efficiently capture provenance for complex transactions over large data sets.
-
Heuristic and Cost-based Optimization for Diverse Provenance Tasks
Xing Niu, Raghav Kapoor, Boris Glavic, Dieter Gawlick, Zhen Hua Liu, Vasudha Krishnaswamy and Venkatesh Radhakrishnan
IEEE Transactions on Knowledge and Data Engineering. 31, 7 (2019), 1267–1280.
-
Going Beyond Provenance: Explaining Query Answers with Pattern-based Counterbalances
Zhengjie Miao, Qitian Zeng, Boris Glavic and Sudeepa Roy
Proceedings of the 44th International Conference on Management of Data (2019), pp. 485–502.
Provenance and intervention-based techniques have been used to explain surprisingly high or low outcomes of aggregation queries. However, such techniques may miss interesting explanations emerging from data that is not in the provenance. For instance, an unusually low number of publications of a prolific researcher in a certain venue and year can be explained by an increased number of publications in another venue in the same year. We present a novel approach for explaining outliers in aggregation queries through counterbalancing. That is, explanations are outliers in the opposite direction of the outlier of interest. Outliers are defined w.r.t. patterns that hold over the data in aggregate. We present efficient methods for mining such aggregate regression patterns (ARPs), discuss how to use ARPs to generate and rank explanations, and experimentally demonstrate the efficiency and effectiveness of our approach.
-
Snapshot Semantics for Temporal Multiset Relations
Anton Dignös, Boris Glavic, Xing Niu, Michael H. Böhlen and Johann Gamper
Proceedings of the VLDB Endowment. 12, 6 (2019), 639–652.
Snapshot semantics is widely used for evaluating queries over temporal data: temporal relations are seen as sequences of snapshot relations, and queries are evaluated at each snapshot. In this work, we demonstrate that current approaches for snapshot semantics over interval-timestamped multiset relations are subject to two bugs regarding snapshot aggregation and bag difference. We introduce a novel temporal data model based on K-relations that overcomes these bugs and prove it to correctly encode snapshot semantics. Furthermore, we present an efficient implementation of our model as a database middleware and demonstrate experimentally that our approach is competitive with native implementations.
-
A High-Performance Distributed Relational Database System for Scalable OLAP Processing
Jason Arnold, Boris Glavic and Ioan Raicu
Proceedings of the 33rd IEEE International Parallel and Distributed Processing Symposium (2019), pp. 738–748.
The scalability of systems such as Hive and Spark SQL that are built on top of big data platforms has enabled OLAP processing over very large data sets. However, the per-node performance of these systems is typically low compared to traditional relational databases. Conversely, Massively Parallel Processing (MPP) databases do not scale as well as these systems. We present HRDBMS, a fully implemented distributed shared-nothing relational database developed with the goal of improving the scalability of OLAP queries. HRDBMS achieves high scalability through a principled combination of techniques from relational and big data systems with novel communication and work-distribution techniques. We also support serializable transactions for compatibility even though the system has not been optimized for this. HRDBMS runs on a custom distributed and asynchronous execution engine that was built from the ground up to support highly parallelized operator implementations. Our experimental comparison with Hive, Spark SQL, and Greenplum confirms that HRDBMS’s scalability is on par with Hive and Spark SQL (up to 96 nodes) while its per-node performance can compete with MPP databases like Greenplum.
-
Uncertainty Annotated Databases - A Lightweight Approach for Approximating Certain Answers
Su Feng, Aaron Huber, Boris Glavic and Oliver Kennedy
Proceedings of the 44th International Conference on Management of Data (2019), pp. 1313–1330.
Certain answers are a principled method for coping with uncertainty that arises in many practical data management tasks. Unfortunately, this method is expensive and may exclude useful (if uncertain) answers. Thus, users frequently resort to less principled approaches to resolve uncertainty. In this paper, we propose Uncertainty Annotated Databases (UA-DBs), which combine an under- and over-approximation of certain answers to achieve the reliability of certain answers, with the performance of a classical database system. Furthermore, in contrast to prior work on certain answers, UA-DBs achieve a higher utility by including some (explicitly marked) answers that are not certain. UA-DBs are based on incomplete K-relations, which we introduce to generalize the classical set-based notion of incomplete databases and certain answers to a much larger class of data models. Using an implementation of our approach, we demonstrate experimentally that it efficiently produces tight approximations of certain answers that are of high utility.
-
Analyzing Uncertain Tabular Data
Oliver Kennedy and Boris Glavic
Information Quality in Information Fusion and Decision Making
Éloi Bossé and Galina Rogova, eds. Springer. 291–320.
It is common practice to spend considerable time refining source data to address issues of data quality before beginning any data analysis. For example, an analyst might impute missing values or detect and fuse duplicate records representing the same real-world entity. However, there are many situations where there are multiple possible candidate resolutions for a data quality issue, but there is not sufficient evidence for determining which of the resolutions is the most appropriate. In this case, the only way forward is to make assumptions to restrict the space of solutions and/or to heuristically choose a resolution based on characteristics that are deemed predictive of “good” resolutions. Although it is important for the analyst to understand the impact of these assumptions and heuristic choices on her results, evaluating this impact can be highly non-trivial and time consuming. For several decades now, the fields of probabilistic, incomplete, and fuzzy databases have developed strategies for analyzing the impact of uncertainty on the outcome of analyses. This general family of uncertainty-aware databases aims to model ambiguity in the results of analyses expressed in standard languages like SQL, SparQL, R, or Spark. An uncertainty-aware database uses descriptions of potential errors and ambiguities in source data to derive a corresponding description of potential errors or ambiguities in the result of an analysis accessing this source data. Depending on technique, these descriptions of uncertainty may be either quantitative (bounds, probabilities), or qualitative (certain outcomes, unknown values, explanations of uncertainty). In this chapter, we explore the types of problems that techniques from uncertainty-aware databases address, survey solutions to these problems, and highlight their application to fixing data quality issues.
-
Data Debugging and Exploration with Vizier
Mike Brachmann, Carlos Bautista, Sonia Castelo, Su Feng, Juliana Freire, Boris Glavic, Oliver Kennedy, Heiko Müller, Rémi Rampin, William Spoth and Ying Yang
Proceedings of the 44th International Conference on Management of Data (Demonstration Track) (2019), pp. 1877–1880.
We present Vizier, a multi-modal data exploration and debugging tool. The system supports a wide range of operations by seamlessly integrating Python, SQL, and automated data curation and debugging methods. Using Spark as an execution backend, Vizier handles large datasets in multiple formats. Ease-of-use is attained through integration of a notebook with a spreadsheet-style interface and with visualizations that guide and support the user in the loop. In addition, native support for provenance and versioning enable collaboration and uncertainty management. In this demonstration we will illustrate the diverse features of the system using several realistic data science tasks based on real data.
-
CAPE: Explaining Outliers by Counterbalancing
Zhengjie Miao, Qitian Zeng, Chenjie Li, Boris Glavic, Oliver Kennedy and Sudeepa Roy
Proceedings of the VLDB Endowment (Demonstration Track). 12, 12 (2019), 1806–1809.
In this demonstration we showcase Cape, a system that explains surprising aggregation outcomes. In contrast to previous work which relies exclusively on provenance, Cape applies a novel approach for explaining outliers in aggregation queries through counterbalancing (outliers in the opposite direction). The foundations of our approach are aggregate regression patterns (ARPs), based on which we define outliers, and an efficient explanation generation algorithm that utilizes these patterns. In the demonstration, the audience can run aggregation queries over real world datasets, and browse the patterns and explanations returned by Cape for outliers in the result of such queries.
-
Query-based Why-not Explanations for Nested Data
Ralf Diestelkämper, Boris Glavic, Melanie Herschel and Seokki Lee
Proceedings of the 11th USENIX Workshop on the Theory and Practice of Provenance (2019).
We present the first query-based approach for explaining missing answers to queries over nested relational data, which is a common data format used by big data systems such as Apache Spark. Our main contributions are a novel way to define query-based why-not provenance based on repairs to queries, and an implementation and preliminary experiments for answering such queries in Spark.
-
PUG: a framework and practical implementation for why and why-not provenance
Seokki Lee, Bertram Ludäscher and Boris Glavic
The VLDB Journal. 28, 1 (Aug. 2019), 47–71.
Explaining why an answer is (or is not) returned by a query is important for many applications including auditing, debugging data and queries, and answering hypothetical questions about data. In this work, we present the first practical approach for answering such questions for queries with negation (first-order queries). Specifically, we introduce a graph-based provenance model that, while syntactic in nature, supports reverse reasoning and is proven to encode a wide range of provenance models from the literature. The implementation of this model in our PUG (Provenance Unification through Graphs) system takes a provenance question and Datalog query as an input and generates a Datalog program that computes an explanation, i.e., the part of the provenance that is relevant to answer the question. Furthermore, we demonstrate how a desirable factorization of provenance can be achieved by rewriting an input query. We experimentally evaluate our approach demonstrating its efficiency.
2018
-
Using Reenactment to Retroactively Capture Provenance for Transactions
Bahareh Arab, Dieter Gawlick, Vasudha Krishnaswamy, Venkatesh Radhakrishnan and Boris Glavic
IEEE Transactions on Knowledge and Data Engineering. 30, 3 (2018), 599–612.
-
GProM - A Swiss Army Knife for Your Provenance Needs
Bahareh Arab, Su Feng, Boris Glavic, Seokki Lee, Xing Niu and Qitian Zeng
IEEE Data Engineering Bulletin. 41, 1 (2018), 51–62.
-
Guest editorial: large-scale data curation and metadata management
Mohamed Eltabakh and Boris Glavic, editors
Distributed and Parallel Databases (2018), 1–4. Springer.
-
Snapshot Semantics for Temporal Multiset Relations (extended version)
Anton Dignös, Boris Glavic, Xing Niu, Michael H. Böhlen and Johann Gamper
Technical Report #IIT/CS-DB-2018-03
Illinois Institute of Technology.
-
Improving Data-Shuffle Performance In Data-Parallel Distributed Systems
Shweelan Samson
Master's thesis, Illinois Institute of Technology.
-
Provenance Summaries for Answers and Non-Answers
Seokki Lee, Bertram Ludäscher and Boris Glavic
Proceedings of the VLDB Endowment (Demonstration Track). 11, 12 (2018), 1954–1957.
-
Let’s Make It Dirty with BART!
Donatello Santoro, Patricia C. Arocena, Boris Glavic, Giansalvatore Mecca, Renée J. Miller and Paolo Papotti
Proceedings of the 26th Italian Symposium on Advanced Database Systems (2018).
2017
-
Carving database storage to detect and trace security breaches
James Wagner, Alexander Rasin, Boris Glavic, Karen Heart, Jacob Furst, Lucas Bressan and Jonathan Grier
Digital Investigation. 22 (2017), S127–S136.
-
DeepSea: Adaptive Workload-Aware Partitioning of Materialized Views in Scalable Data Analytics
Jiang Du, Boris Glavic, Wei Tan and Renée J. Miller
Proceedings of the 20th International Conference on Extending Database Technology (2017), pp. 198–209.
-
Adaptive Schema Databases
William Spoth, Bahareh Arab, Eric S. Chan, Dieter Gawlick, Adel Ghoneimy, Boris Glavic, Beda Hammerschmidt, Oliver Kennedy, Seokki Lee, Zhen Hua Liu, Xing Niu and Ying Yang
Proceedings of the 8th Biennial Conference on Innovative Data Systems (2017).
-
A SQL-Middleware Unifying Why and Why-Not Provenance for First-Order Queries
Seokki Lee, Sven Köhler, Bertram Ludäscher and Boris Glavic
Proceedings of the 33rd IEEE International Conference on Data Engineering (2017), pp. 485–496.
-
Answering Historical What-if Queries with Provenance, Reenactment, and Symbolic Execution
Bahareh Arab and Boris Glavic
Proceedings of the 8th USENIX Workshop on the Theory and Practice of Provenance (2017).@inproceedings{AG17b, author = {Arab, Bahareh and Glavic, Boris}, booktitle = {Proceedings of the 8th USENIX Workshop on the Theory and Practice of Provenance}, isworkshop = {true}, keywords = {Provenance;Reenactment;What-if}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/AG17b.pdf}, projects = {GProM;Reenactment}, title = {Answering Historical What-if Queries with Provenance, Reenactment, and Symbolic Execution}, venueshort = {TaPP}, year = {2017}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/AG17b.pdf} }
-
Integrating Approximate Summarization with Provenance Capture
Seokki Lee, Xing Niu, Bertram Ludäscher and Boris Glavic
Proceedings of the 8th USENIX Workshop on the Theory and Practice of Provenance (2017).@inproceedings{SN17, author = {Lee, Seokki and Niu, Xing and Lud\"{a}scher, Bertram and Glavic, Boris}, booktitle = {Proceedings of the 8th USENIX Workshop on the Theory and Practice of Provenance}, isworkshop = {true}, keywords = {Provenance; Datalog; GProM; Missing Answers; Game Provenance; PUGS}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/SN17.pdf}, projects = {GProM; PUGS}, title = {Integrating Approximate Summarization with Provenance Capture}, venueshort = {TaPP}, year = {2017}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/SN17.pdf} }
-
Debugging Transactions and Tracking their Provenance with Reenactment
Xing Niu, Boris Glavic, Seokki Lee, Bahareh Arab, Dieter Gawlick, Zhen Hua Liu, Vasudha Krishnaswamy, Su Feng and Xun Zou
Proceedings of the VLDB Endowment (Demonstration Track). 10, 12 (2017) , 1857–1860.@article{NG17, author = {Niu, Xing and Glavic, Boris and Lee, Seokki and Arab, Bahareh and Gawlick, Dieter and Liu, Zhen Hua and Krishnaswamy, Vasudha and Feng, Su and Zou, Xun}, journal = {Proceedings of the VLDB Endowment (Demonstration Track)}, keywords = {Provenance; GProM; Reenactment; Debugging; Concurrency Control; Reenactment}, number = {12}, pages = {1857--1860}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/XG17.pdf}, projects = {GProM; Reenactment}, title = {Debugging Transactions and Tracking their Provenance with Reenactment}, venueshort = {PVLDB}, volume = {10}, year = {2017}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/XG17.pdf} }
-
Provenance-aware Query Optimization
Xing Niu, Raghav Kapoor, Boris Glavic, Dieter Gawlick, Zhen Hua Liu, Vasudha Krishnaswamy and Venkatesh Radhakrishnan
Proceedings of the 33rd IEEE International Conference on Data Engineering (2017), pp. 473–484.@inproceedings{XN17, author = {Niu, Xing and Kapoor, Raghav and Glavic, Boris and Gawlick, Dieter and Liu, Zhen Hua and Krishnaswamy, Vasudha and Radhakrishnan, Venkatesh}, booktitle = {Proceedings of the 33rd IEEE International Conference on Data Engineering}, keywords = {Provenance; Cost-based optimization; Query instrumentation; Annotation propagation; GProM}, pages = {473-484}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/XN17.pdf}, projects = {GProM}, title = {Provenance-aware Query Optimization}, venueshort = {ICDE}, year = {2017} }
2016
-
Implementing Unified Why- and Why-Not Provenance Through Games
Seokki Lee, Sven Köhler, Bertram Ludäscher and Boris Glavic
Proceedings of the 8th USENIX Workshop on the Theory and Practice of Provenance (Poster) (2016).@inproceedings{LS16, author = {Lee, Seokki and K\"{o}hler, Sven and Lud\"{a}scher, Bertram and Glavic, Boris}, booktitle = {Proceedings of the 8th USENIX Workshop on the Theory and Practice of Provenance (Poster)}, isworkshop = {true}, keywords = {Provenance; Game Provenance; Datalog; GProM; Missing Answers; PUGS}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/LS16.pdf}, projects = {PUGS}, title = {{Implementing Unified Why- and Why-Not Provenance Through Games}}, venueshort = {TaPP}, year = {2016}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/LS16.pdf} }
-
Mimir: Bringing CTables into Practice
Arindam Nandi, Ying Yang, Oliver Kennedy, Boris Glavic, Ronny Fehling, Zhen Hua Liu and Dieter Gawlick
Technical Report #arXiv:1601.00073
CoRR.@techreport{NY16a, author = {Nandi, Arindam and Yang, Ying and Kennedy, Oliver and Glavic, Boris and Fehling, Ronny and Liu, Zhen Hua and Gawlick, Dieter}, institution = {CoRR}, keywords = {Probabilistic Databases; Data Cleaning; Mimir}, number = {arXiv:1601.00073}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/NY16a.pdf}, projects = {Mimir}, title = {Mimir: Bringing CTables into Practice}, venueshort = {Techreport}, year = {2016} }
-
Provenance-aware Versioned Dataworkspaces
Xing Niu, Bahareh Arab, Dieter Gawlick, Zhen Hua Liu, Vasudha Krishnaswamy, Oliver Kennedy and Boris Glavic
Proceedings of the 8th USENIX Workshop on the Theory and Practice of Provenance (2016).@inproceedings{XN16, author = {Niu, Xing and Arab, Bahareh and Gawlick, Dieter and Liu, Zhen Hua and Krishnaswamy, Vasudha and Kennedy, Oliver and Glavic, Boris}, booktitle = {Proceedings of the 8th USENIX Workshop on the Theory and Practice of Provenance}, isworkshop = {true}, keywords = {Provenance; GProM; Data Cleaning}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/XN16.pdf}, projects = {GProM}, title = {Provenance-aware Versioned Dataworkspaces}, venueshort = {TaPP}, year = {2016} }
-
The Exception that Improves the Rule
Juliana Freire, Boris Glavic, Oliver Kennedy and Heiko Müller
SIGMOD Workshop on Human-In-the-Loop Data Analytics (2016).@inproceedings{FG16, author = {Freire, Juliana and Glavic, Boris and Kennedy, Oliver and M\"{u}ller, Heiko}, booktitle = {SIGMOD Workshop on Human-In-the-Loop Data Analytics}, isworkshop = {true}, keywords = {Vizier; Provenance; Data Cleaning}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/FG16.pdf}, projects = {Vizier}, title = {{The Exception that Improves the Rule}}, venueshort = {HILDA}, year = {2016}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/FG16.pdf} }
-
Benchmarking Data Curation Systems
Patricia C. Arocena, Boris Glavic, Giansalvatore Mecca, Renée J. Miller, Paolo Papotti and Donatello Santoro
IEEE Data Engineering Bulletin. 39, 2 (2016) , 47–62.@article{AGG16, author = {Arocena, Patricia C. and Glavic, Boris and Mecca, Giansalvatore and Miller, Ren{\'{e}}e J. and Papotti, Paolo and Santoro, Donatello}, journal = {{IEEE} Data Engineering Bulletin}, keywords = {iBench; BART}, number = {2}, pages = {47--62}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/AGG16.pdf}, projects = {iBench; BART}, title = {Benchmarking Data Curation Systems}, venueshort = {Data Eng. Bull.}, volume = {39}, year = {2016} }
-
Provenance and Annotation of Data and Processes - 6th International Provenance and Annotation Workshop, IPAW 2016, McLean, VA, USA, June 7-8, 2016, Proceedings
Marta Mattoso and Boris Glavic, editors
Springer.@proceedings{MG16, doi = {10.1007/978-3-319-40593-3}, editor = {Mattoso, Marta and Glavic, Boris}, isbn = {978-3-319-40592-6}, pdfurl = {http://dx.doi.org/10.1007/978-3-319-40593-3}, publisher = {Springer}, series = {Lecture Notes in Computer Science}, title = {Provenance and Annotation of Data and Processes - 6th International Provenance and Annotation Workshop, {IPAW} 2016, McLean, VA, USA, June 7-8, 2016, Proceedings}, venueshort = {IPAW}, volume = {9672}, year = {2016} }
-
BART in Action: Error Generation and Empirical Evaluations of Data-Cleaning Systems
Donatello Santoro, Patricia C. Arocena, Boris Glavic, Giansalvatore Mecca, Renée J. Miller and Paolo Papotti
Proceedings of the 42nd International Conference on Management of Data (SIGMOD) (Demonstration Track) (2016), pp. 2161–2164.@inproceedings{SA16, author = {Santoro, Donatello and Arocena, Patricia C. and Glavic, Boris and Mecca, Giansalvatore and Miller, Ren{\'{e}}e J. and Papotti, Paolo}, booktitle = {Proceedings of the 42nd International Conference on Management of Data ({SIGMOD}) (Demonstration Track)}, keywords = {BART}, pages = {2161--2164}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/SA16.pdf}, projects = {BART}, title = {{BART} in Action: Error Generation and Empirical Evaluations of Data-Cleaning Systems}, venueshort = {SIGMOD}, year = {2016} }
-
Reenactment for Read-Committed Snapshot Isolation
Bahareh Arab, Dieter Gawlick, Vasudha Krishnaswamy, Venkatesh Radhakrishnan and Boris Glavic
Proceedings of the 25th ACM International Conference on Information and Knowledge Management (2016), pp. 841–850.@inproceedings{AG17, author = {Arab, Bahareh and Gawlick, Dieter and Krishnaswamy, Vasudha and Radhakrishnan, Venkatesh and Glavic, Boris}, booktitle = {Proceedings of the 25th ACM International Conference on Information and Knowledge Management}, keywords = {Provenance; Concurrency Control; Reenactment; GProM}, pages = {841--850}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/AG17.pdf}, longversionurl = {https://arxiv.org/pdf/1608.08258}, projects = {GProM; Reenactment}, title = {Reenactment for Read-Committed Snapshot Isolation}, venueshort = {CIKM}, year = {2016} }
-
Reenactment for Read-Committed Snapshot Isolation (long version)
Bahareh Arab, Dieter Gawlick, Vasudha Krishnaswamy, Venkatesh Radhakrishnan and Boris Glavic
Illinois Institute of Technology.@techreport{AG17a, author = {Arab, Bahareh and Gawlick, Dieter and Krishnaswamy, Vasudha and Radhakrishnan, Venkatesh and Glavic, Boris}, institution = {Illinois Institute of Technology}, keywords = {Provenance; Concurrency Control; Reenactment; GProM}, pdfurl = {http://cs.iit.edu/%7Edbgroup/assets/pdfpubls/AG16a.pdf}, projects = {GProM; Reenactment}, title = {Reenactment for Read-Committed Snapshot Isolation (long version)}, venueshort = {Techreport}, year = {2016} }
-
Optimizing Provenance Capture and Queries - Algebraic Transformations and Cost-based Optimization
Xing Niu and Boris Glavic
Technical Report #IIT/CS-DB-2016-02
Illinois Institute of Technology.@techreport{XN16a, author = {Niu, Xing and Glavic, Boris}, date-added = {2016-09-17 20:07:29 +0000}, date-modified = {2016-09-17 20:09:08 +0000}, institution = {Illinois Institute of Technology}, keywords = {Provenance; Query Optimization; GProM}, number = {IIT/CS-DB-2016-02}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/XN16a.pdf}, projects = {GProM}, title = {Optimizing Provenance Capture and Queries - Algebraic Transformations and Cost-based Optimization}, venueshort = {Techreport}, year = {2016}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/XN16a.pdf} }
-
Efficiently Computing Provenance Graphs for Queries with Negation
Seokki Lee, Sven Köhler, Bertram Ludäscher and Boris Glavic
Technical Report #IIT/CS-DB-2016-03
Illinois Institute of Technology.@techreport{LS16a, author = {Lee, Seokki and K\"{o}hler, Sven and Lud\"{a}scher, Bertram and Glavic, Boris}, date-modified = {2016-10-20 12:15:28 +0000}, institution = {Illinois Institute of Technology}, keywords = {Provenance; Datalog; GProM; Missing Answers}, number = {IIT/CS-DB-2016-03}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/LS16a.pdf}, projects = {GProM; PUGS}, title = {Efficiently Computing Provenance Graphs for Queries with Negation}, venueshort = {Techreport}, year = {2016}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/LS16a.pdf} }
-
Formal Foundations of Reenactment and Transaction Provenance
Bahareh Arab, Dieter Gawlick, Vasudha Krishnaswamy, Venkatesh Radhakrishnan and Boris Glavic
Technical Report #IIT/CS-DB-2016-01
Illinois Institute of Technology.@techreport{AG16a, author = {Arab, Bahareh and Gawlick, Dieter and Krishnaswamy, Vasudha and Radhakrishnan, Venkatesh and Glavic, Boris}, date-added = {2014-09-17 20:07:29 +0000}, date-modified = {2014-09-17 20:09:08 +0000}, institution = {Illinois Institute of Technology}, keywords = {Provenance; Concurrency Control; Reenactment; GProM}, number = {IIT/CS-DB-2016-01}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/AG16.pdf}, projects = {GProM; Reenactment}, title = {Formal Foundations of Reenactment and Transaction Provenance}, venueshort = {Techreport}, year = {2016}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/AG16.pdf} }
2015
-
Computing Candidate Keys Of Relational Operators For Optimizing Rewrite-Based Provenance Computation
Andrea Cornudella
Illinois Institute of Technology.@mastersthesis{A15, author = {Cornudella, Andrea}, keywords = {Provenance; Vagabond; Data Exchange}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/A15.pdf}, projects = {Vagabond}, school = {Illinois Institute of Technology}, title = {{Computing Candidate Keys Of Relational Operators For Optimizing Rewrite-Based Provenance Computation}}, venueshort = {Master Thesis}, year = {2015}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/A15.pdf} }
-
Automatic Generation and Ranking of Explanations for Mapping Errors
Seokki Lee, Zhen Wang, Boris Glavic and Renée J. Miller
Technical Report #IIT/CS-DB-2015-01
Illinois Institute of Technology.@techreport{LW15, author = {Lee, Seokki and Wang, Zhen and Glavic, Boris and Miller, Ren\'{e}e J.}, date-modified = {2015-08-08 08:34:28 +0000}, institution = {Illinois Institute of Technology}, keywords = {Provenance; Vagabond; Data Exchange}, number = {IIT/CS-DB-2015-01}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/LW15.pdf}, projects = {Vagabond}, title = {Automatic Generation and Ranking of Explanations for Mapping Errors}, venueshort = {Techreport}, year = {2015}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/LW15.pdf} }
-
The iBench Integration Metadata Generator
Patricia C. Arocena, Boris Glavic, Radu Ciucanu and Renée J. Miller
University of Toronto.@techreport{AG15, author = {Arocena, Patricia C. and Glavic, Boris and Ciucanu, Radu and Miller, Ren\'{e}e J.}, institution = {University of Toronto}, keywords = {iBench; Data Exchange; Data Integration}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/AG15.pdf}, projects = {iBench}, title = {The iBench Integration Metadata Generator}, venueshort = {Techreport}, year = {2015}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/AG15.pdf} }
-
Towards Constraint-based Explanations for Answers and Non-Answers
Boris Glavic, Sven Köhler, Sean Riddle and Bertram Ludäscher
Proceedings of the 7th USENIX Workshop on the Theory and Practice of Provenance (2015).@inproceedings{GK15, author = {Glavic, Boris and K\"{o}hler, Sven and Riddle, Sean and Lud\"{a}scher, Bertram}, booktitle = {Proceedings of the 7th USENIX Workshop on the Theory and Practice of Provenance}, isworkshop = {true}, keywords = {Provenance;Missing Answers;Summarization;Datalog;Game Provenance; PUGS}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GK15.pdf}, projects = {PUGS}, slideurl = {http://www.slideshare.net/lordPretzel/2015-ta-ppwhynotpptx}, title = {Towards Constraint-based Explanations for Answers and Non-Answers}, venueshort = {TaPP}, year = {2015}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GK15.pdf} }
-
Interoperability for Provenance-aware Databases using PROV and JSON
Xing Niu, Raghav Kapoor, Dieter Gawlick, Zhen Hua Liu, Vasudha Krishnaswamy, Venkatesh Radhakrishnan and Boris Glavic
Proceedings of the 7th USENIX Workshop on the Theory and Practice of Provenance (2015).@inproceedings{PJ15, author = {Niu, Xing and Kapoor, Raghav and Gawlick, Dieter and Liu, Zhen Hua and Krishnaswamy, Vasudha and Radhakrishnan, Venkatesh and Glavic, Boris}, booktitle = {Proceedings of the 7th USENIX Workshop on the Theory and Practice of Provenance}, isworkshop = {true}, keywords = {Provenance;JSON;GProM;PROV}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/PJ15.pdf}, projects = {GProM}, slideurl = {http://www.slideshare.net/lordPretzel/2015-tapp}, title = {Interoperability for Provenance-aware Databases using PROV and JSON}, venueshort = {TaPP}, year = {2015}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/PJ15.pdf} }
-
Sharing and Reproducing Database Applications
Quan Pham, Richard Whaling, Boris Glavic and Tanu Malik
Proceedings of the VLDB Endowment (Demonstration Track). 8, 12 (2015) , 1988–1999.@article{PW15, author = {Pham, Quan and Whaling, Richard and Glavic, Boris and Malik, Tanu}, journal = {Proceedings of the VLDB Endowment (Demonstration Track)}, keywords = {Provenance;LDV;Repeatability;Database Virtualization}, number = {12}, pages = {1988 - 1999}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/PW15.pdf}, projects = {LDV}, title = {{Sharing and Reproducing Database Applications}}, venueshort = {PVLDB}, volume = {8}, year = {2015}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/PW15.pdf} }
-
Heuristic and Cost-based Optimization for Provenance Computation
Xing Niu, Raghav Kapoor and Boris Glavic
Proceedings of the 7th USENIX Workshop on the Theory and Practice of Provenance (Poster) (2015).@inproceedings{NK15, author = {Niu, Xing and Kapoor, Raghav and Glavic, Boris}, booktitle = {Proceedings of the 7th USENIX Workshop on the Theory and Practice of Provenance (Poster)}, isworkshop = {true}, keywords = {Provenance; Query Optimization; GProM}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/NK15.pdf}, projects = {GProM}, title = {{Heuristic and Cost-based Optimization for Provenance Computation}}, venueshort = {TaPP}, year = {2015}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/NK15.pdf} }
-
Making Database Applications Shareable
Boris Glavic, Tanu Malik and Quan Pham
Proceedings of the 7th USENIX Workshop on the Theory and Practice of Provenance (Poster) (2015).@inproceedings{GM15, author = {Glavic, Boris and Malik, Tanu and Pham, Quan}, booktitle = {Proceedings of the 7th USENIX Workshop on the Theory and Practice of Provenance (Poster)}, isworkshop = {true}, keywords = {Provenance;LDV;Repeatability;Database Virtualization}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GM15.pdf}, projects = {LDV}, title = {{Making Database Applications Shareable}}, venueshort = {TaPP}, year = {2015}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GM15.pdf} }
-
Error Generation for Evaluating Data Cleaning Algorithms
Patricia C. Arocena, Boris Glavic, Giansalvatore Mecca, Renée J. Miller, Paolo Papotti and Donatello Santoro
Technical Report #TR-01-2015
Università della Basilicata.@techreport{AG15a, author = {Arocena, Patricia C. and Glavic, Boris and Mecca, Giansalvatore and Miller, Ren{\'e}e J. and Papotti, Paolo and Santoro, Donatello}, institution = {Universit{\`a} della Basilicata}, keywords = {BART; Data Cleaning; Benchmarking}, number = {TR-01-2015}, pdfurl = {http://db.unibas.it/projects/bart/files/TR-01-2015.pdf}, projects = {BART}, title = {{Error Generation for Evaluating Data Cleaning Algorithms}}, venueshort = {Techreport}, year = {2015} }
-
Messing Up with Bart: Error Generation for Evaluating Data-Cleaning Algorithms
Patricia C. Arocena, Boris Glavic, Giansalvatore Mecca, Renée J. Miller, Paolo Papotti and Donatello Santoro
Proceedings of the VLDB Endowment. 9, 2 (2015) , 36–47.@article{AG16, author = {Arocena, Patricia C. and Glavic, Boris and Mecca, Giansalvatore and Miller, Ren\'ee J. and Papotti, Paolo and Santoro, Donatello}, journal = {Proceedings of the VLDB Endowment}, keywords = {BART; Data Cleaning; Benchmarking}, number = {2}, pages = {36-47}, pdfurl = {http://db.unibas.it/projects/bart/files/p191-arocena.pdf}, projects = {BART}, slideurl = {http://www.slideshare.net/lordPretzel/2016-vldb-messing-up-with-bart-error-generation-for-evaluating-datacleaning-algorithms}, title = {{Messing Up with Bart: Error Generation for Evaluating Data-Cleaning Algorithms}}, venueshort = {PVLDB}, volume = {9}, year = {2015} }
-
Gain Control over your Integration Evaluations
Patricia C. Arocena, Radu Ciucanu, Boris Glavic and Renée J. Miller
Proceedings of the VLDB Endowment (Demonstration Track). 8, 12 (2015) , 1960–1971.@article{AC15, author = {Arocena, Patricia C. and Ciucanu, Radu and Glavic, Boris and Miller, Ren{\'e}e J.}, journal = {Proceedings of the VLDB Endowment (Demonstration Track)}, keywords = {iBench; Data Exchange; Data Integration; Benchmarking}, number = {12}, pages = {1960 - 1971}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/AC15.pdf}, projects = {iBench}, title = {{Gain Control over your Integration Evaluations}}, venueshort = {PVLDB}, volume = {8}, year = {2015} }
-
The iBench Integration Metadata Generator
Patricia C. Arocena, Boris Glavic, Radu Ciucanu and Renée J. Miller
Proceedings of the VLDB Endowment. 9, 3 (2015) , 108–119.@article{AG15c, author = {Arocena, Patricia C. and Glavic, Boris and Ciucanu, Radu and Miller, Ren\'{e}e J.}, journal = {Proceedings of the VLDB Endowment}, keywords = {iBench; Data Exchange; Data Integration; Benchmarking}, number = {3}, pages = {108-119}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/AG15.pdf}, projects = {iBench}, slideurl = {http://www.slideshare.net/lordPretzel/2016-vldb-the-ibench-integration-metadata-generator}, title = {{The iBench Integration Metadata Generator}}, venueshort = {PVLDB}, volume = {9}, year = {2015} }
-
HRDBMS: A NewSQL Database for Analytics
Jason Arnold, Boris Glavic and Ioan Raicu
Proceedings of the IEEE International Conference on Cluster Computing (Poster) (2015).@inproceedings{AG15b, author = {Arnold, Jason and Glavic, Boris and Raicu, Ioan}, booktitle = {Proceedings of the IEEE International Conference on Cluster Computing (Poster)}, keywords = {Big Data; HRDBMS; Distributed Databases}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/AG15b.pdf}, projects = {HRDBMS}, title = {HRDBMS: A NewSQL Database for Analytics}, venueshort = {Cluster}, year = {2015} }
-
An Efficient Implementation of Game Provenance in DBMS
Seokki Lee, Yuchen Tang, Sven Köhler, Bertram Ludäscher and Boris Glavic
Technical Report #IIT/CS-DB-2015-02
Illinois Institute of Technology.@techreport{LW15a, author = {Lee, Seokki and Tang, Yuchen and K\"{o}hler, Sven and Lud\"{a}scher, Bertram and Glavic, Boris}, date-modified = {2015-10-22 12:15:28 +0000}, institution = {Illinois Institute of Technology}, keywords = {Provenance; Game Provenance; Datalog; GProM; Missing Answers; PUGS}, number = {IIT/CS-DB-2015-02}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/LW15a.pdf}, projects = {PUGS}, title = {An Efficient Implementation of Game Provenance in DBMS}, venueshort = {Techreport}, year = {2015}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/LW15a.pdf} }
-
LDV: Light-weight Database Virtualization
Quan Pham, Tanu Malik, Boris Glavic and Ian Foster
Proceedings of the 31st IEEE International Conference on Data Engineering (2015), pp. 1179–1190.@inproceedings{PM15, author = {Pham, Quan and Malik, Tanu and Glavic, Boris and Foster, Ian}, booktitle = {Proceedings of the 31st IEEE International Conference on Data Engineering}, keywords = {Provenance;LDV;Repeatability}, pages = {1179-1190}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/PM15.pdf}, projects = {LDV}, slideurl = {http://www.slideshare.net/lordPretzel/icde-ldv}, title = {LDV: Light-weight Database Virtualization}, venueshort = {ICDE}, year = {2015}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/PM15.pdf} }
2014
-
A Generic Provenance Middleware for Database Queries, Updates, and Transactions
Bahareh Arab, Dieter Gawlick, Venkatesh Radhakrishnan, Hao Guo and Boris Glavic
Proceedings of the 6th USENIX Workshop on the Theory and Practice of Provenance (2014).@inproceedings{AG14, author = {Arab, Bahareh and Gawlick, Dieter and Radhakrishnan, Venkatesh and Guo, Hao and Glavic, Boris}, booktitle = {Proceedings of the 6th USENIX Workshop on the Theory and Practice of Provenance}, isworkshop = {true}, keywords = {Reenactment; Provenance; Concurrency Control; GProM}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/AG14.pdf}, projects = {GProM}, slideurl = {http://www.slideshare.net/lordPretzel/tapp-2014-talk-boris}, title = {A Generic Provenance Middleware for Database Queries, Updates, and Transactions}, venueshort = {TaPP}, year = {2014}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/AG14.pdf} }
We present an architecture and prototype implementation for a generic provenance database middleware (GProM) that is based on the concept of query rewrites, which are applied to an algebraic graph representation of database operations. The system supports a wide range of provenance types and representations for queries, updates, transactions, and operations spanning multiple transactions. GProM supports several strategies for provenance generation, e.g., on-demand, rule-based, and “always on”. To the best of our knowledge, we are the first to present a solution for computing the provenance of concurrent database transactions. Our solution can retroactively trace transaction provenance as long as an audit log and time travel functionality are available (both are supported by most DBMS). Other noteworthy features of GProM include: extensibility through a declarative rewrite rule specification language, support for multiple database backends, and an optimizer for rewritten queries.
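To make the rewrite-based interface concrete, here is a minimal sketch of an on-demand provenance request in GProM-style SQL. The table sales(item, price) is hypothetical, and the exact keyword syntax and provenance attribute naming should be checked against the GProM documentation; the point is only that a provenance request is rewritten into a regular SQL query over the same database.
  -- Ordinary aggregation query.
  SELECT item, SUM(price) AS total FROM sales GROUP BY item;
  -- On-demand provenance request (GProM-style, hypothetical schema): the
  -- middleware rewrites this into plain SQL whose result contains each answer
  -- tuple extended with attributes identifying the input tuples it derives from.
  PROVENANCE OF (SELECT item, SUM(price) AS total FROM sales GROUP BY item);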
-
Efficient Stream Provenance via Operator Instrumentation
Boris Glavic, Kyumars Sheykh Esmaili, Peter M. Fischer and Nesime Tatbul
Transactions on Internet Technology. 13, 1 (2014) , 7:1–7:26.@article{GE14, author = {Glavic, Boris and Esmaili, Kyumars Sheykh and Fischer, Peter M. and Tatbul, Nesime}, date-added = {2014-05-11 17:49:19 +0000}, date-modified = {2014-05-11 17:55:40 +0000}, journal = {Transactions on Internet Technology}, keywords = {Ariadne; Provenance}, number = {1}, pages = {7:1-7:26}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GE14.pdf}, projects = {Ariadne}, title = {Efficient Stream Provenance via Operator Instrumentation}, venueshort = {TOIT}, volume = {13}, year = {2014}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GE14.pdf} }
Managing fine-grained provenance is a critical requirement for data stream management systems (DSMS), not only to address complex applications that require diagnostic capabilities and assurance, but also to provide advanced functionality such as revision processing or query debugging. This paper introduces a novel approach that uses operator instrumentation, i.e., modifying the behavior of operators, to generate and propagate fine-grained provenance through several operators of a query network. In addition to applying this technique to compute provenance eagerly during query execution, we also study how to decouple provenance computation from query processing to reduce run-time overhead and avoid unnecessary provenance retrieval. Our proposals include computing a concise superset of the provenance (to allow lazily replaying a query and reconstructing its provenance) as well as lazy retrieval (to avoid unnecessary reconstruction of provenance). We develop stream-specific compression methods to reduce the computational and storage overhead of provenance generation and retrieval. Ariadne, our provenance-aware extension of the Borealis DSMS, implements these techniques. Our experiments confirm that Ariadne manages provenance with minor overhead and clearly outperforms query rewrite, the current state of the art.
-
Efficient Scoring and Ranking of Explanation for Data Exchange Errors in Vagabond
Zhen Wang
Illinois Institute of Technology.@mastersthesis{W14, author = {Wang, Zhen}, date-added = {2014-05-21 18:55:49 +0000}, date-modified = {2014-05-21 18:55:49 +0000}, keywords = {Provenance; Vagabond; Data Exchange}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/Z14.pdf}, projects = {Vagabond}, school = {Illinois Institute of Technology}, title = {{Efficient Scoring and Ranking of Explanation for Data Exchange Errors in Vagabond}}, venueshort = {Master Thesis}, year = {2014}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/Z14.pdf} }
-
Reenacting Transactions to Compute their Provenance
Bahareh Arab, Dieter Gawlick, Vasudha Krishnaswamy, Venkatesh Radhakrishnan and Boris Glavic
Technical Report #IIT/CS-DB-2014-02
Illinois Institute of Technology.@techreport{AG14a, author = {Arab, Bahareh and Gawlick, Dieter and Krishnaswamy, Vasudha and Radhakrishnan, Venkatesh and Glavic, Boris}, date-added = {2014-09-17 20:07:29 +0000}, date-modified = {2014-09-17 20:09:08 +0000}, institution = {Illinois Institute of Technology}, keywords = {Provenance; Concurrency Control; Reenactment; GProM}, number = {IIT/CS-DB-2014-02}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/AD14.pdf}, projects = {GProM; Reenactment}, title = {Reenacting Transactions to Compute their Provenance}, venueshort = {Techreport}, year = {2014}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/AD14.pdf} }
-
A Primer on Database Provenance
Boris Glavic
Technical Report #IIT/CS-DB-2014-01
Illinois Institute of Technology.@techreport{G14, author = {Glavic, Boris}, date-added = {2014-09-17 20:02:47 +0000}, date-modified = {2014-09-22 14:12:43 +0000}, institution = {Illinois Institute of Technology}, keywords = {Provenance}, number = {IIT/CS-DB-2014-01}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/G14.pdf}, title = {A Primer on Database Provenance}, venueshort = {Techreport}, year = {2014}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/G14.pdf} }
-
LDV: Light-weight Database Virtualization
Quan Pham, Tanu Malik, Boris Glavic and Ian Foster
Technical Report #IIT/CS-DB-2014-03
Illinois Institute of Technology.@techreport{PM14, author = {Pham, Quan and Malik, Tanu and Glavic, Boris and Foster, Ian}, date-added = {2014-10-08 20:07:29 +0000}, date-modified = {2014-10-08 20:09:08 +0000}, institution = {Illinois Institute of Technology}, keywords = {Provenance;LDV;Repeatability}, number = {IIT/CS-DB-2014-03}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/PM14.pdf}, projects = {LDV}, title = {LDV: Light-weight Database Virtualization}, venueshort = {Techreport}, year = {2014}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/PM14.pdf} }
2013
-
Using SQL for Efficient Generation and Querying of Provenance Information
Boris Glavic, Renée J. Miller and Gustavo Alonso
In search of elegance in the theory and practice of computation: a Festschrift in honour of Peter Buneman. (2013) , 291–320.@article{GM13, author = {Glavic, Boris and Miller, Ren{\'e}e J. and Alonso, Gustavo}, date-added = {2013-07-09 22:54:03 +0000}, date-modified = {2013-08-22 22:56:39 +0000}, journal = {{In search of elegance in the theory and practice of computation: a Festschrift in honour of Peter Buneman}}, keywords = {Perm; Provenance}, pages = {291-320}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GM13.pdf}, projects = {Perm}, title = {Using SQL for Efficient Generation and Querying of Provenance Information}, venueshort = {Festschrift Peter Buneman}, year = {2013}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GM13.pdf} }
In applications such as data warehousing or data exchange, the ability to efficiently generate and query provenance information is crucial to understand the origin of data. In this chapter, we review some of the main contributions of Perm, a DBMS that generates different types of provenance information for complex SQL queries (including nested and correlated subqueries and aggregation). The two key ideas behind Perm are representing data and its provenance together in a single relation and relying on query rewrites to generate this representation. Through this, Perm supports fully integrated, on-demand provenance generation and querying using SQL. Since Perm rewrites a query requesting provenance into a regular SQL query and generates easily optimizable SQL code, its performance greatly benefits from the query optimization techniques provided by the underlying DBMS.
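A minimal sketch of the single-relation representation described above, assuming a hypothetical table emp(name, dept); Perm's concrete keyword and provenance attribute names may differ from what is shown here.
  -- Ordinary query.
  SELECT dept, COUNT(*) AS cnt FROM emp GROUP BY dept;
  -- Perm-style provenance request: rewritten into regular SQL whose result
  -- repeats each answer tuple once per contributing input tuple and appends
  -- that tuple's attributes as provenance columns (e.g., prov_emp_name,
  -- prov_emp_dept).
  SELECT PROVENANCE dept, COUNT(*) AS cnt FROM emp GROUP BY dept;
Because the output is an ordinary relation, the provenance itself can be filtered, joined, and aggregated with standard SQL and optimized by the underlying DBMS.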
-
iBench First Cut
Patricia C. Arocena, Mariana D’Angelo, Boris Glavic and Renée J. Miller
University of Toronto.@techreport{AD13, author = {Arocena, Patricia C. and D'Angelo, Mariana and Glavic, Boris and Miller, Ren{\'e}e J.}, date-added = {2014-01-08 17:16:53 +0000}, date-modified = {2014-01-08 17:17:42 +0000}, institution = {University of Toronto}, keywords = {iBench; Data Exchange; Data Integration; Benchmarking}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/AD13.pdf}, projects = {iBench}, title = {iBench First Cut}, venueshort = {Techreport}, year = {2013}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/AD13.pdf} }
-
Ariadne: Managing Fine-Grained Provenance on Data Streams
Boris Glavic, Kyumars Sheykh Esmaili, Peter M. Fischer and Nesime Tatbul
Proceedings of the 7th ACM International Conference on Distributed Event-Based Systems (2013), pp. 291–320.@inproceedings{GE13, author = {Glavic, Boris and Esmaili, Kyumars Sheykh and Fischer, Peter M. and Tatbul, Nesime}, booktitle = {Proceedings of the 7th ACM International Conference on Distributed Event-Based Systems}, date-added = {2013-05-13 14:03:56 +0000}, date-modified = {2013-06-02 19:45:50 +0000}, keywords = {Ariadne; Provenance}, pages = {291-320}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GE13.pdf}, projects = {Ariadne}, slideurl = {http://www.slideshare.net/lordPretzel/2013-debs-Ariadne}, title = {Ariadne: Managing Fine-Grained Provenance on Data Streams}, venueshort = {DEBS}, year = {2013}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GE13.pdf} }
Managing fine-grained provenance is a critical requirement for data stream management systems (DSMS), not only to address complex applications that require diagnostic capabilities and assurance, but also to provide advanced functionality such as revision processing or query debugging. This paper introduces a novel approach that uses operator instrumentation, i.e., modifying the behavior of operators, to generate and propagate fine-grained provenance through several operators of a query network. In addition to applying this technique to compute provenance eagerly during query execution, we also study how to decouple provenance computation from query processing to reduce run-time overhead and avoid unnecessary provenance retrieval. This includes computing a concise superset of the provenance to allow lazily replaying a query network and reconstructing its provenance, as well as lazy retrieval to avoid unnecessary reconstruction of provenance. We develop stream-specific compression methods to reduce the computational and storage overhead of provenance generation and retrieval. Ariadne, our provenance-aware extension of the Borealis DSMS, implements these techniques. Our experiments confirm that Ariadne manages provenance with minor overhead and clearly outperforms query rewrite, the current state of the art.
-
Provenance Management for Frequent Itemsets
Javed Siddique, Boris Glavic and Renée J. Miller
University of Toronto.@techreport{SG13, author = {Siddique, Javed and Glavic, Boris and Miller, Ren\'{e}e J.}, date-added = {2013-05-13 14:03:56 +0000}, date-modified = {2013-05-13 14:18:04 +0000}, institution = {University of Toronto}, keywords = {Provenance; Data Mining}, pdfurl = {http://dblab.cs.toronto.edu/project/provenance4mining/docs/fimprov_main.pdf}, title = {Provenance Management for Frequent Itemsets}, venueshort = {Techreport}, year = {2013}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/SG13.pdf} }
Provenance has been studied extensively for relational queries and shown to be important in revealing the origin and creation process of data that has been produced by potentially complex relational transformations. Provenance for the results of data mining operators, in contrast, has not been considered. We argue that provenance offers the same benefits for mining as for relational queries, e.g., it allows us to track errors caused by incorrect input data. We consider the most common mining operator, frequent itemset mining, and introduce two types of provenance (why- and i-provenance) for this operator. We argue that the concept of why-provenance for relational queries can be adapted for frequent itemsets, but that it poses new computational challenges due to the nature of itemset mining and the size of why-provenance. We address these challenges in two ways: first, by combining why-provenance computation with SQL querying to permit users to select small and more intuitive representations of the provenance, and second, by proposing new compression techniques for the why-provenance. Next, we introduce a new provenance type called i-provenance (itemset provenance) that succinctly represents the interdependencies between items and transactions that explain how a frequent itemset was derived (intuitively giving insight into the structure of the data that provides the evidence for the itemset). We present techniques for efficient storage and use of both types of provenance information and experimentally evaluate the scalability of our approach. We argue through a set of examples that why- and i-provenance can add significant value to mining results and can be used to analyze the context of the transactions that caused an itemset to be frequent and to understand how combinations of itemsets contribute to a result.
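As a rough intuition for why-provenance of a frequent itemset (an illustrative reading, not necessarily the report's exact formal definition):
  transactions: t1 = {a, b, c}, t2 = {a, b}, t3 = {b, c}; minimum support = 2
  frequent itemset {a, b}  ->  witnessing transactions {t1, t2}
Even in this toy case the witness set grows with the data, which is why the report pairs provenance computation with SQL querying over it and with compression techniques.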
-
Value Invention for Data Exchange
Patricia C. Arocena, Boris Glavic and Renée J. Miller
Proceedings of the 39th International Conference on Management of Data (2013), pp. 157–168.@inproceedings{AG13, author = {Arocena, Patricia C. and Glavic, Boris and Miller, Ren\'{e}e J.}, booktitle = {Proceedings of the 39th International Conference on Management of Data}, date-added = {2013-05-13 14:03:56 +0000}, date-modified = {2013-06-02 19:45:38 +0000}, keywords = {Data Exchange; iBench}, pages = {157-168}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/AG13.pdf}, projects = {iBench}, slideurl = {http://www.slideshare.net/lordPretzel/sigmod-2013-patricias-talk-on-value-invention}, title = {Value Invention for Data Exchange}, venueshort = {SIGMOD}, year = {2013}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/AG13.pdf} }
The creation of values to represent incomplete information, often referred to as value invention, is central in data exchange. Within schema mappings, Skolem functions have long been used for value invention as they permit a precise representation of missing information. Recent work on a powerful mapping language called second-order tuple generating dependencies (SO tgds) has drawn attention to the fact that the use of arbitrary Skolem functions can have negative computational and programmatic properties in data exchange. In this paper, we present two techniques for understanding when the Skolem functions needed to represent the correct semantics of incomplete information are computationally well-behaved. Specifically, we consider when the Skolem functions in second-order (SO) mappings have a first-order (FO) semantics and are therefore programmatically and computationally more desirable for use in practice. Our first technique, linearization, significantly extends the Nash, Bernstein and Melnik unskolemization algorithm by understanding when the sets of arguments of the Skolem functions in a mapping are related by set inclusion. We show that such a linear relationship leads to mappings that have FO semantics and are expressible in popular mapping languages including source-to-target tgds and nested tgds. Our second technique uses source semantics, specifically functional dependencies (including keys), to transform SO mappings into equivalent FO mappings. We show that our algorithms are applicable to a strictly larger class of mappings than previous approaches, but, more importantly, we present an extensive experimental evaluation that quantifies this difference (about 78% improvement) over an extensive schema mapping benchmark and illustrates the applicability of our results on real mappings.
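A minimal illustration of the value invention in question, over a hypothetical Takes/Advisor schema:
  \forall x\, \forall y\, (Takes(x,y) \rightarrow \exists z\, Advisor(x,z))          (st-tgd: an advisor value must be invented for each student)
  \exists f\, \forall x\, \forall y\, (Takes(x,y) \rightarrow Advisor(x, f(x,y)))    (skolemized SO tgd: the Skolem term f(x,y) stands for the invented value)
Different choices of Skolem-function arguments (for instance f(x) versus f(x,y)) encode different semantics for the missing information; the linearization and source-constraint techniques above identify cases where the functions that arise still admit a first-order semantics.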
-
Provenance for Data Mining
Boris Glavic, Javed Siddique, Periklis Andritsos and Renée J. Miller
Proceedings of the 5th USENIX Workshop on the Theory and Practice of Provenance (2013).@inproceedings{GS13, author = {Glavic, Boris and Siddique, Javed and Andritsos, Periklis and Miller, Ren\'{e}e J.}, booktitle = {Proceedings of the 5th USENIX Workshop on the Theory and Practice of Provenance}, date-added = {2013-05-13 14:00:58 +0000}, date-modified = {2013-05-13 14:01:56 +0000}, isworkshop = {true}, keywords = {Provenance}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GS13.pdf}, slideurl = {http://www.slideshare.net/lordPretzel/tapp-2013}, title = {Provenance for Data Mining}, venueshort = {TaPP}, year = {2013}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GS13.pdf} }
Data mining aims at extracting useful information from large datasets. Most data mining approaches reduce the input data to produce a smaller output summarizing the mining result. While the purpose of data mining (extracting information) necessitates this reduction in size, the loss of information it entails can be problematic. Specifically, the results of data mining may be more confusing than insightful, if the user is not able to understand on which input data they are based and how they were created. In this paper, we argue that the user needs access to the provenance of mining results. Provenance, while extensively studied by the database, workflow, and distributed systems communities, has not yet been considered for data mining. We analyze the differences between database, workflow, and data mining provenance, suggest new types of provenance, and identify new use-cases for provenance in data mining. To illustrate our ideas, we present a more detailed discussion of these concepts for two typical data mining algorithms: frequent itemset mining and multi-dimensional scaling.
2012
-
Big Data Provenance: Challenges and Implications for Benchmarking
Boris Glavic
2nd Workshop on Big Data Benchmarking (2012), pp. 72–80.@inproceedings{G13, author = {Glavic, Boris}, booktitle = {2nd Workshop on Big Data Benchmarking}, isworkshop = {true}, keywords = {Big Data; Provenance; Big Provenance}, pages = {72-80}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/G13.pdf}, projects = {Big Provenance}, slideurl = {http://www.slideshare.net/lordPretzel/wbdb-2012-wbdb}, title = {Big Data Provenance: Challenges and Implications for Benchmarking}, venueshort = {WBDB}, year = {2012}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/G13.pdf} }
Data Provenance is information about the origin and creation process of data. Such information is useful for debugging data and transformations, auditing, evaluating the quality of and trust in data, modelling authenticity, and implementing access control for derived data. Provenance has been studied by the database, workflow, and distributed systems communities, but provenance for Big Data - which we refer to as Big Provenance - is a largely unexplored field. This paper reviews existing approaches for large-scale distributed provenance and discusses potential challenges for Big Data benchmarks that aim to incorporate provenance data/management. Furthermore, we will examine how Big Data benchmarking could benefit from different types of provenance information. We argue that provenance can be used for identifying and analyzing performance bottlenecks, to compute performance metrics, and to test a system’s ability to exploit commonalities in data and processing.
-
Ariadne: Managing Fine-Grained Provenance on Data Streams
Boris Glavic, Kyumars Sheykh Esmaili, Peter M. Fischer and Nesime Tatbul
Technical Report #771
ETH Zürich.@techreport{GE12, author = {Glavic, Boris and Esmaili, Kyumars Sheykh and Fischer, Peter M. and Tatbul, Nesime}, date-added = {2012-12-14 18:55:49 +0000}, date-modified = {2012-12-18 17:16:08 +0000}, institution = {ETH Z{\"u}rich}, keywords = {Ariadne; Provenance}, number = {771}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GE12.pdf}, projects = {Ariadne}, title = {Ariadne: Managing Fine-Grained Provenance on Data Streams}, venueshort = {Techreport}, year = {2012}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GE12.pdf} }
2011
-
The Case for Fine-Grained Stream Provenance
Boris Glavic, Kyumars Sheykh Esmaili, Peter M. Fischer and Nesime Tatbul
Proceedings of the 1st Workshop on Data Streams and Event Processing collocated with BTW (2011), pp. 58–61.@inproceedings{GE11, author = {Glavic, Boris and Esmaili, Kyumars Sheykh and Fischer, Peter M. and Tatbul, Nesime}, bibsource = {DBLP, http://dblp.uni-trier.de}, booktitle = {Proceedings of the 1st Workshop on Data Streams and Event Processing collocated with BTW}, crossref = {DBLP:conf/btw/2011w}, date-added = {2012-12-14 18:55:49 +0000}, date-modified = {2012-12-18 17:16:17 +0000}, isworkshop = {true}, keywords = {Ariadne; Provenance}, pages = {58-61}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GE11.pdf}, projects = {Ariadne}, title = {{The Case for Fine-Grained Stream Provenance}}, venueshort = {DSEP}, year = {2011}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GE11.pdf} }
The current state of the art for provenance in data stream management systems (DSMS) is to provide provenance at a high level of abstraction (such as from which sensors in a sensor network an aggregated value is derived). This limitation was imposed by high-throughput requirements and an anticipated lack of application demand for more detailed provenance information. In this work, we first demonstrate by means of well-chosen use cases that this is a misconception, i.e., coarse-grained provenance is in fact insufficient for many application domains. We then analyze the requirements and challenges involved in integrating support for fine-grained provenance into a streaming system and outline a scalable solution for supporting tuple-level provenance in DSMS.
-
Smile: Enabling Easy and Fast Development of Domain-Specific Scheduling Protocols
Christian Tilgner, Boris Glavic, Michael H. Böhlen and Carl-Christian Kanne
Proceedings of the 28th British National Conference on Databases (2011), pp. 128–131.@inproceedings{TG11a, author = {Tilgner, Christian and Glavic, Boris and B\"ohlen, Michael H. and Kanne, Carl-Christian}, bibsource = {DBLP, http://dblp.uni-trier.de}, booktitle = {Proceedings of the 28th British National Conference on Databases}, crossref = {DBLP:conf/bncod/2011}, date-added = {2012-12-14 18:55:49 +0000}, date-modified = {2012-12-18 17:16:36 +0000}, ee = {http://dx.doi.org/10.1007/978-3-642-24577-0_13}, keywords = {Oshiya}, pages = {128-131}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/TG11a.pdf}, projects = {Oshiya}, title = {{Smile: Enabling Easy and Fast Development of Domain-Specific Scheduling Protocols}}, venueshort = {BNCOD}, year = {2011}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/TG11a.pdf} }
Modern server systems schedule large numbers of concurrent requests constrained by, e.g., correctness criteria and service-level agreements. Since standard database management systems provide only limited consistency levels, the state of the art is to develop schedulers imperatively, which is time-consuming and error-prone. In this poster, we present Smile (declarative Scheduling MIddLEware), a tool for developing domain-specific scheduling protocols declaratively. Smile decreases the effort to implement and adapt such protocols because it abstracts from low-level scheduling details, allowing developers to focus on the protocol implementation. We demonstrate the advantages of our approach by implementing a domain-specific use case protocol.
-
Reexamining Some Holy Grails of Data Provenance
Boris Glavic and Renée J. Miller
Proceedings of the 3rd USENIX Workshop on the Theory and Practice of Provenance (2011).@inproceedings{GM11, author = {Glavic, Boris and Miller, Ren\'{e}e J.}, booktitle = {Proceedings of the 3rd USENIX Workshop on the Theory and Practice of Provenance}, date-added = {2012-12-14 18:55:49 +0000}, date-modified = {2012-12-18 17:17:10 +0000}, isworkshop = {true}, keywords = {Provenance}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GM11.pdf}, slideurl = {http://www.slideshare.net/lordPretzel/tapp-2014-talk-boris-41165865}, title = {{Reexamining Some Holy Grails of Data Provenance}}, venueshort = {TaPP}, year = {2011}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GM11.pdf} }
We reconsider some of the explicit and implicit properties that underlie well-established definitions of data provenance semantics. Previous work on comparing provenance semantics has mostly focused on expressive power (does the provenance generated by a certain semantics subsume the provenance generated by other semantics) and on understanding whether a semantics is insensitive to query rewrite (i.e., do equivalent queries have the same provenance). In contrast, we try to investigate why certain semantics possess specific properties (like insensitivity) and whether these properties are always desirable. We present a new property, stability with respect to query language extension, that, to the best of our knowledge, has not been isolated and studied on its own.
-
Declarative Serializable Snapshot Isolation
Christian Tilgner, Boris Glavic, Michael H. Böhlen and Carl-Christian Kanne
Proceedings of the 15th International Conference on Advances in Database and Information Systems (2011), pp. 170–184.@inproceedings{TG11, author = {Tilgner, Christian and Glavic, Boris and B\"ohlen, Michael H. and Kanne, Carl-Christian}, bibsource = {DBLP, http://dblp.uni-trier.de}, booktitle = {Proceedings of the 15th International Conference on Advances in Database and Information Systems}, crossref = {DBLP:conf/adbis/2011}, date-added = {2012-12-14 18:55:49 +0000}, date-modified = {2012-12-18 17:17:23 +0000}, ee = {http://dx.doi.org/10.1007/978-3-642-23737-9_13}, keywords = {Oshiya}, pages = {170-184}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/TG11.pdf}, projects = {Oshiya}, title = {{Declarative Serializable Snapshot Isolation}}, venueshort = {ADBIS}, year = {2011}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/TG11.pdf} }
Snapshot isolation (SI) is a popular concurrency control protocol, but it permits non-serializable schedules that violate database integrity. The Serializable Snapshot Isolation (SSI) protocol ensures (view) serializability by preventing pivot structures in SI schedules. In this paper, we leverage the SSI approach and develop the Declarative Serializable Snapshot Isolation (DSSI) protocol, an SI protocol that guarantees serializable schedules. Our approach requires no analysis of application programs or changes to the underlying DBMS. We present an implementation and prove that it ensures serializability.
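For readers unfamiliar with why plain SI needs this kind of protection, the classic write-skew anomaly is a minimal illustration (the doctors table and on-call invariant are a textbook example, not taken from this paper):
  -- doctors(name, on_call) initially ('Ann', 1), ('Bob', 1);
  -- application invariant: at least one doctor stays on call.
  -- T1: SELECT COUNT(*) FROM doctors WHERE on_call = 1;     -- snapshot sees 2
  -- T2: SELECT COUNT(*) FROM doctors WHERE on_call = 1;     -- snapshot sees 2
  -- T1: UPDATE doctors SET on_call = 0 WHERE name = 'Ann';
  -- T2: UPDATE doctors SET on_call = 0 WHERE name = 'Bob';
  -- T1: COMMIT;   T2: COMMIT;   -- disjoint write sets, so SI admits both
  -- Outcome: no doctor remains on call, a non-serializable schedule that
  -- SSI-style protocols such as DSSI are designed to rule out.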
-
Debugging Data Exchange with Vagabond
Boris Glavic, Jiang Du, Renée J. Miller, Gustavo Alonso and Laura M. Haas
Proceedings of the VLDB Endowment (Demonstration Track). 4, 12 (2011) , 1383–1386.@article{GD11, author = {Glavic, Boris and Du, Jiang and Miller, Ren\'{e}e J. and Alonso, Gustavo and Haas, Laura M.}, date-added = {2012-12-14 18:55:49 +0000}, date-modified = {2012-12-18 17:17:34 +0000}, journal = {Proceedings of the VLDB Endowment (Demonstration Track)}, keywords = {Provenance; Vagabond; Data Exchange}, number = {12}, pages = {1383-1386}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GD11.pdf}, projects = {Vagabond}, title = {{Debugging Data Exchange with Vagabond}}, venueshort = {PVLDB}, volume = {4}, year = {2011}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GD11.pdf} }
In this paper, we present Vagabond, a system that uses a novel holistic approach to help users understand and debug data exchange scenarios. Developing such a scenario is a complex and labor-intensive process where errors are often only revealed in the target instance produced as the result of this process. This makes it very hard to debug such scenarios, especially for non-power users. Vagabond aids a user in debugging by automatically generating possible explanations for target instance errors identified by the user.
2010
-
TRAMP: Understanding the Behavior of Schema Mappings through Provenance
Boris Glavic, Gustavo Alonso, Renée J. Miller and Laura M. Haas
Proceedings of the Very Large Data Bases Endowment. 3, 1 (2010) , 1314–1325.@article{GA10, author = {Glavic, Boris and Alonso, Gustavo and Miller, Ren\'{e}e J. and Haas, Laura M.}, date-added = {2012-12-14 18:55:49 +0000}, date-modified = {2012-12-18 17:17:43 +0000}, journal = {Proceedings of the Very Large Data Bases Endowment}, keywords = {TRAMP; Provenance; Data Exchange}, number = {1}, pages = {1314-1325}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GA10.pdf}, projects = {TRAMP}, slideurl = {http://www.slideshare.net/lordPretzel/2010-vldb-tramp}, title = {{TRAMP: Understanding the Behavior of Schema Mappings through Provenance}}, venueshort = {PVLDB}, volume = {3}, year = {2010}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GA10.pdf} }
Though partially automated, developing schema mappings remains a complex and potentially error-prone task. In this paper, we present TRAMP (TRAnsformation Mapping Provenance), an extensive suite of tools supporting the debugging and tracing of schema mappings and transformation queries. TRAMP combines and extends data provenance with two novel notions, transformation provenance and mapping provenance, to explain the relationship between transformed data and those transformations and mappings that produced that data. In addition we provide query support for transformations, data, and all forms of provenance. We formally define transformation and mapping provenance, present an efficient implementation of both forms of provenance, and evaluate the resulting system through extensive experiments.
-
Perm: Efficient Provenance Support for Relational Databases
Boris Glavic
University of Zurich.@phdthesis{G10a, author = {Glavic, Boris}, date-added = {2012-12-14 18:55:49 +0000}, date-modified = {2012-12-14 18:55:49 +0000}, keywords = {Provenance; Perm}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/G10a.pdf}, projects = {Perm}, school = {University of Zurich}, title = {{Perm: Efficient Provenance Support for Relational Databases}}, venueshort = {PhD Thesis}, year = {2010}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/G10a.pdf} }
In many application areas like scientific computing, data-warehousing, and data integration, detailed information about the origin of data is required. This kind of information is often referred to as data provenance. The provenance of a piece of data, a so-called data item, includes information about the source data from which it is derived and the transformations that lead to its creation and current representation. In the context of relational databases, provenance has been studied both from a theoretical and algorithmic perspective. Yet, in spite of the advances made, there are very few practical systems available that support generating, querying, and storing provenance information (we refer to such systems as provenance management systems, or PMS). These systems support only a subset of SQL, a severe limitation in practice since most of the application domains that benefit from provenance information use complex queries. Such queries typically involve nested sub-queries, aggregation, and/or user-defined functions. Without support for these constructs, a provenance management system is of limited use. Furthermore, existing approaches use different data models to represent provenance and the data for which provenance is computed (normal data). This has the intrinsic disadvantage that a new query language has to be developed for querying provenance information. Naturally, such a query language is not as powerful and mature as, e.g., SQL. In this thesis we present Perm, a novel relational provenance management system that addresses the shortcomings of existing approaches discussed above. The underlying idea of Perm is to represent provenance information as standard relations and to generate and query it using standard SQL queries: "Use SQL to compute and query the provenance of SQL queries". Perm is implemented on top of PostgreSQL, extending its SQL dialect with provenance features that are implemented as query rewrites. This approach enables the system to take full advantage of the advanced query optimizer of PostgreSQL and to provide full SQL query support for provenance information. Several important steps were necessary to realize our vision of a "purely relational" provenance management system that is capable of generating provenance information for complex SQL queries. We developed new notions of provenance that handle SQL constructs not covered by the standard definitions of provenance. Based on these provenance definitions, rewrite rules for relational algebra expressions are defined that transform an algebra expression q into an algebra expression computing the provenance of q (these rewrite rules are proven to produce correct and complete results). The implementation of Perm, based on this solid theoretical foundation, applies a variety of novel optimization techniques that reduce the cost of some intrinsically expensive provenance operations. By applying the Perm system to schema mapping debugging (a prominent use case for provenance) and through extensive performance measurements, we confirm the feasibility of our approach and the superiority of Perm over alternative approaches.
-
Formal Foundation of Contribution Semantics and Provenance Computation through Query Rewrite in TRAMP
Boris Glavic
University of Zurich.@techreport{G10, author = {Glavic, Boris}, date-added = {2012-12-14 18:55:49 +0000}, date-modified = {2012-12-18 17:10:21 +0000}, institution = {University of Zurich}, keywords = {TRAMP; Provenance}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/G10.pdf}, projects = {TRAMP}, title = {{Formal Foundation of Contribution Semantics and Provenance Computation through Query Rewrite in TRAMP}}, venueshort = {Techreport}, year = {2010}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/G10.pdf} }
-
Data lineage/provenance in XQuery
Donald Kossmann, Peter M. Fischer, Kyumars Sheykh Esmaili, Boris Glavic and Beat Steiger
ETH Zurich.@mastersthesis{KF10, author = {Kossmann, Donald and Fischer, Peter M. and Esmaili, Kyumars Sheykh and Glavic, Boris and Steiger, Beat}, date-added = {2012-12-14 18:55:49 +0000}, date-modified = {2012-12-14 18:55:49 +0000}, keywords = {Provenance}, school = {ETH Zurich}, title = {{Data lineage/provenance in XQuery}}, venueshort = {Master Thesis}, year = {2010} }
-
Correctness Proof of the Declarative SS2PL Protocol Implementation
Christian Tilgner, Boris Glavic, Michael H. Böhlen and Carl-Christian Kanne
University of Zurich.@techreport{TG10, author = {Tilgner, Christian and Glavic, Boris and B\"ohlen, Michael H. and Kanne, Carl-Christian}, date-added = {2012-12-14 18:55:49 +0000}, date-modified = {2012-12-18 17:16:01 +0000}, group = {dbtg}, institution = {University of Zurich}, keywords = {Oshiya}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/TG10.pdf}, projects = {Oshiya}, title = {{Correctness Proof of the Declarative SS2PL Protocol Implementation}}, venueshort = {Techreport}, year = {2010}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/TG10.pdf} }
2009
-
The Perm Provenance Management System in Action
Boris Glavic and Gustavo Alonso
Proceedings of the 35th ACM SIGMOD International Conference on Management of Data (Demonstration Track) (2009), pp. 1055–1058.@inproceedings{GA09b, author = {Glavic, Boris and Alonso, Gustavo}, booktitle = {Proceedings of the 35th ACM SIGMOD International Conference on Management of Data (Demonstration Track)}, date-added = {2012-12-14 18:55:49 +0000}, date-modified = {2012-12-14 18:55:49 +0000}, keywords = {Provenance; Perm}, pages = {1055-1058}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GA09b.pdf}, projects = {Perm}, title = {{The Perm Provenance Management System in Action}}, venueshort = {SIGMOD}, year = {2009}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GA09b.pdf} }
In this demonstration we present the Perm provenance management system (PMS). Perm is capable of computing, storing, and querying provenance information for the relational data model. Provenance is computed using query rewriting techniques that annotate tuples with provenance information. Thus, provenance data and provenance computations are represented as relational data and queries and, hence, can be queried, stored, and optimized using standard relational database techniques. This demo shows the complete Perm system and lets attendees examine in detail the process of query rewriting and provenance retrieval in Perm, the most complete data provenance system available today. For example, Perm supports lazy and eager provenance computation, external provenance, and various contribution semantics.
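The contrast between lazy and eager provenance computation mentioned above can be pictured as follows. The sketch is hypothetical (the classes and names are invented, not Perm's API): eager provenance is materialized when derived data is stored, whereas lazy provenance is recomputed on demand from the defining query.

```python
# Hypothetical sketch (names invented) of eager vs. lazy provenance.

class EagerTable:
    """Eager: each derived row is stored together with its provenance."""
    def __init__(self):
        self.rows = []
    def insert_derived(self, value, source_ids):
        self.rows.append({"value": value, "provenance": set(source_ids)})

class LazyView:
    """Lazy: only the defining query is kept; provenance is obtained later by
    re-evaluating a provenance-propagating version of that query."""
    def __init__(self, predicate, base):
        self.predicate, self.base = predicate, base
    def provenance(self):
        return [(row["value"], {row["_id"]})
                for row in self.base if self.predicate(row)]

base = [{"_id": 1, "value": 10}, {"_id": 2, "value": 99}]

eager = EagerTable()
eager.insert_derived(99, source_ids=[2])           # provenance stored up front
lazy = LazyView(lambda r: r["value"] > 50, base)   # provenance derived on demand

print(eager.rows)          # [{'value': 99, 'provenance': {2}}]
print(lazy.provenance())   # [(99, {2})]
```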
-
Provenance for Nested Subqueries
Boris Glavic and Gustavo Alonso
Proceedings of the 12th International Conference on Extending Database Technology (2009), pp. 982–993.@inproceedings{GA09a, author = {Glavic, Boris and Alonso, Gustavo}, booktitle = {Proceedings of the 12th International Conference on Extending Database Technology}, date-added = {2012-12-14 18:55:49 +0000}, date-modified = {2012-12-14 18:55:49 +0000}, keywords = {Provenance; Perm}, pages = {982-993}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GA09a.pdf}, projects = {Perm}, slideurl = {http://www.slideshare.net/lordPretzel/edbt-2009-provenance-for-nested-subqueries}, title = {{Provenance for Nested Subqueries}}, venueshort = {EDBT}, year = {2009}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GA09a.pdf} }
Data provenance is essential in applications such as scientific computing, curated databases, and data warehouses. Several systems have been developed that provide provenance functionality for the relational data model. These systems support only a subset of SQL, a severe limitation in practice, since most of the application domains that benefit from provenance information use complex queries. Such queries typically involve nested subqueries, aggregation, and/or user-defined functions. Without support for these constructs, a provenance management system is of limited use. In this paper we address this limitation by exploring the problem of provenance derivation for complex queries. More precisely, we demonstrate that the widely used definition of Why-provenance fails in the presence of nested subqueries, and we show how the definition can be modified to produce meaningful results in this setting. We further present query rewrite rules to transform an SQL query into a query propagating provenance. The solution introduced in this paper allows us to track provenance information for a far wider subset of SQL than any of the existing approaches. We have incorporated these ideas into the engine of the Perm provenance management system and used it to evaluate the feasibility and performance of our approach.
-
Perm: Processing Provenance and Data on the same Data Model through Query Rewriting
Boris Glavic and Gustavo Alonso
Proceedings of the 25th IEEE International Conference on Data Engineering (2009), pp. 174–185.@inproceedings{GA09, author = {Glavic, Boris and Alonso, Gustavo}, booktitle = {Proceedings of the 25th IEEE International Conference on Data Engineering}, date-added = {2012-12-14 18:55:49 +0000}, date-modified = {2012-12-14 18:55:49 +0000}, keywords = {Provenance; Perm}, pages = {174-185}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GA09.pdf}, projects = {Perm}, slideurl = {http://www.slideshare.net/lordPretzel/icde-2009}, title = {{Perm: Processing Provenance and Data on the same Data Model through Query Rewriting}}, venueshort = {ICDE}, year = {2009}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GA09.pdf} }
Data provenance is information that describes how a given data item was produced. The provenance includes source and intermediate data as well as the transformations involved in producing the concrete data item. In the context of relational databases, the source and intermediate data items are relations, tuples, and attribute values. The transformations are SQL queries and/or functions on the relational data items. Existing approaches capture provenance information by extending the underlying data model. This has the intrinsic disadvantage that the provenance must be stored and accessed using a different model than the actual data. In this paper, we present an alternative approach that uses query rewriting to annotate result tuples with provenance information. The rewritten query and its result use the same model and can, thus, be queried, stored, and optimized using standard relational database techniques. In the paper we formalize the query rewriting procedures, prove their correctness, and evaluate a first implementation of the ideas using PostgreSQL. As the experiments indicate, our approach efficiently provides provenance information while inducing only a small overhead on normal operations.
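For a simple select-project-join query, the rewriting idea can be pictured as extending the query's SELECT list with renamed copies of the input tables' attributes, so that every result tuple carries the input tuples it stems from. The snippet below builds such a rewritten query string; the schema and the `rewrite_spj` helper are made up for illustration, and the rewrite shown is a simplification rather than Perm's actual rules (which also cover aggregation, set operations, and nesting).

```python
def rewrite_spj(select_list, from_tables, where, table_columns):
    """Naive illustration of the rewriting idea for select-project-join
    queries: append renamed copies of all input-table attributes to the
    SELECT list so each result tuple carries its contributing inputs."""
    prov_cols = [
        f"{t}.{c} AS prov_{t}_{c}"
        for t in from_tables
        for c in table_columns[t]
    ]
    return ("SELECT " + ", ".join(select_list + prov_cols)
            + " FROM " + ", ".join(from_tables)
            + " WHERE " + where)

# Hypothetical schema: shops(name, city), sales(shop, amount)
print(rewrite_spj(
    select_list=["shops.city", "sales.amount"],
    from_tables=["shops", "sales"],
    where="shops.name = sales.shop AND sales.amount > 30",
    table_columns={"shops": ["name", "city"], "sales": ["shop", "amount"]},
))
```

The output of the rewrite is itself an ordinary SQL query, which is why the result can be stored, queried, and optimized by the same engine that runs the original query.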
2008
-
Clustering Multidimensional Sequences in Spatial and Temporal Databases
Ira Assent, Ralph Krieger, Boris Glavic and Thomas Seidl
International Journal on Knowledge and Information Systems. 16, 1 (2008) , 29–51.@article{AK08, author = {Assent, Ira and Krieger, Ralph and Glavic, Boris and Seidl, Thomas}, date-added = {2012-12-14 18:55:49 +0000}, date-modified = {2012-12-14 18:55:49 +0000}, journal = {International Journal on Knowledge and Information Systems}, keywords = {Data Mining}, number = {1}, pages = {29-51}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/AK08.pdf}, title = {{Clustering Multidimensional Sequences in Spatial and Temporal Databases}}, venueshort = {KAIS}, volume = {16}, year = {2008}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/AK08.pdf} }
Many environmental, scientific, technical, or medical database applications require effective and efficient mining of time series, sequences, or trajectories of measurements taken at different time points and positions, forming large temporal or spatial databases. In particular, the analysis of concurrent and multidimensional sequences poses new challenges in finding clusters of arbitrary length and varying numbers of attributes. We present a novel algorithm capable of finding parallel clusters in different subspaces and demonstrate our results for temporal and spatial applications. Our analysis of structural quality parameters in rivers is successfully used by hydrologists to develop measures for river quality improvements.
2007
-
Data Provenance: A Categorization of Existing Approaches
Boris Glavic and Klaus R. Dittrich
Proceedings of the 12th GI Conference on Datenbanksysteme in Business, Technologie und Web (2007), pp. 227–241.@inproceedings{GD07, author = {Glavic, Boris and Dittrich, Klaus R.}, bibsource = {DBLP, http://dblp.uni-trier.de}, booktitle = {Proceedings of the 12th GI Conference on Datenbanksysteme in Business, Technologie und Web}, date-added = {2012-12-14 18:55:49 +0000}, date-modified = {2012-12-14 18:55:49 +0000}, keywords = {Provenance}, local-url = {file://localhost/Users/admin/Documents/Uni/IFI/Papers/GD07_Data%20Provenance%20A%20Categorization%20of%20Existing%20Approaches_0.pdf}, pages = {227-241}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GD07.pdf}, projects = {Perm}, title = {{Data Provenance: A Categorization of Existing Approaches}}, venueshort = {BTW}, year = {2007}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GD07.pdf} }
In many application areas like e-science and data warehousing, detailed information about the origin of data is required. This kind of information is often referred to as data provenance or data lineage. The provenance of a data item includes information about the processes and source data items that led to its creation and current representation. The diversity of data representation models and application domains has led to a number of more or less formal definitions of provenance. Most of them are limited to a specific application domain, data representation model, or data processing facility. Not surprisingly, the associated implementations are also restricted to some application domain and depend on a specific data model. In this paper we give a survey of data provenance models and prototypes, present a general categorization scheme for provenance models, and use this categorization scheme to study the properties of the existing approaches. This categorization enables us to distinguish between different kinds of provenance information and could lead to a better understanding of provenance in general. Besides the categorization of provenance types, it is important to also consider the storage, transformation, and query requirements for the different kinds of provenance information and application domains. The analysis of existing approaches will assist us in revealing open research problems in the area of data provenance.
2006
-
Spatial Multidimensional Sequence Clustering
Ira Assent, Ralph Krieger, Boris Glavic and Thomas Seidl
Proceedings of the 1st International Workshop on Spatial and Spatio-temporal Data Mining collocated with ICDM (2006), pp. 343–348.@inproceedings{AK06, author = {Assent, Ira and Krieger, Ralph and Glavic, Boris and Seidl, Thomas}, bibsource = {DBLP, http://dblp.uni-trier.de}, booktitle = {Proceedings of the 1st International Workshop on Spatial and Spatio-temporal Data Mining collocated with ICDM}, date-added = {2012-12-14 18:55:49 +0000}, date-modified = {2012-12-14 18:55:49 +0000}, ee = {http://doi.ieeecomputersociety.org/10.1109/ICDMW.2006.153}, keywords = {Data Mining}, local-url = {file://localhost/Users/admin/Documents/Uni/IFI/Papers/AKGS06_Spatial%20Multidimensional%20Sequence%20Clustering_0.pdf}, pages = {343-348}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/AK06.pdf}, title = {{Spatial Multidimensional Sequence Clustering}}, venueshort = {SSTDM}, year = {2006}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/AK06.pdf} }
Measurements taken at different time points and positions in large temporal or spatial databases require effective and efficient data mining techniques. For several parallel measurements, finding clusters of arbitrary length and number of attributes poses additional challenges. We present a novel algorithm capable of finding parallel clusters in different subspaces and demonstrate our results on structural quality parameter values for river sequences, which are used by hydrologists to develop measures for river quality improvements.
-
sesam: Ensuring Privacy for an Interdisciplinary Longitudinal Study
Boris Glavic and Klaus R. Dittrich
Proceedings of the 1st Workshop Elektronische Datentreuhänderschaft - Anwendungen, Verfahren, Grundlagen collocated with GI Jahrestagung (2006), pp. 736–743.@inproceedings{GD06, author = {Glavic, Boris and Dittrich, Klaus R.}, booktitle = {Proceedings of the 1st Workshop Elektronische Datentreuh\"anderschaft - Anwendungen, Verfahren, Grundlagen collocated with GI Jahrestagung}, date-added = {2012-12-14 18:55:49 +0000}, date-modified = {2012-12-20 23:31:32 +0000}, isworkshop = {true}, keywords = {sesam}, pages = {736-743}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GD06.pdf}, title = {sesam: Ensuring Privacy for an Interdisciplinary Longitudinal Study}, venueshort = {GI Jahrestagung}, year = {2006}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/GD06.pdf} }
Most medical, biological, and social studies face the problem of storing information about subjects for research purposes without violating the subjects' privacy. In most cases it is not possible to remove all information that could be linked to a subject, because some of this information is needed for the research itself. This holds especially for longitudinal studies, which collect data about a subject at different times and places. Longitudinal studies need to link different data about a specific subject, collected at different times, for research and administrative use. In this paper we present the security concept proposed for sesam, a longitudinal interdisciplinary study that analyses the social, biological, and psychological risk factors for the development of psychological diseases. Our security concept is based on pseudonymisation, encrypted data transfer, and an electronic data custodianship. This paper is mainly a case study, and some of the security problems that emerged in the context of sesam may not occur in other studies. Nevertheless, we believe that an adapted version of our approach could be used in other application scenarios as well.
2005
-
Subspace Sequence Clustering - Datamining zur Entscheidungsunterstützung in der Hydrologie
Boris Glavic
Proceedings of the 11th GI Conference on Database Systems for Business, Technology, and Web (Student Track) (2005), pp. 15–17.@inproceedings{G05, author = {Glavic, Boris}, bibsource = {DBLP, http://dblp.uni-trier.de}, booktitle = {Proceedings of the 11th GI Conference on Database Systems for Business, Technology, and Web (Student Track)}, date-added = {2012-12-14 18:55:49 +0000}, date-modified = {2012-12-14 18:55:49 +0000}, keywords = {Data Mining}, pages = {15-17}, pdfurl = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/G05.pdf}, title = {{Subspace Sequence Clustering - Datamining zur Entscheidungsunterst{\"u}tzung in der Hydrologie}}, venueshort = {BTW}, year = {2005}, bdsk-url-1 = {http://cs.iit.edu/%7edbgroup/assets/pdfpubls/G05.pdf} }