% Conference paper (IEEE CLUSTER 2024): DaYu, dataflow analysis for distributed scientific workflows.
% Fields such as price, copyright, numpages, keywords and abstract are kept as annotations from the original export.
@inproceedings{tang2024dayu,
  author       = {Tang, Meng and Cernuda, Jaime and Ye, Jie and Guo, Luanzheng and Tallent, Nathan R. and Kougkas, Anthony and Sun, Xian-He},
  title        = {{DaYu}: Optimizing Distributed Scientific Workflows by Decoding Dataflow Semantics and Dynamics},
  booktitle    = {Proceedings of the 2024 {IEEE} International Conference on Cluster Computing ({CLUSTER})},
  series       = {CLUSTER '24},
  year         = {2024},
  month        = sep,
  publisher    = {Institute of Electrical and Electronics Engineers},
  organization = {IEEE},
  address      = {Piscataway, NJ, USA},
  location     = {Kobe, Japan},
  pages        = {357--369},
  numpages     = {13},
  doi          = {10.1109/CLUSTER59578.2024.00038},
  isbn         = {979-8-3503-5871-1},
  issn         = {2168-9253},
  keywords     = {distributed workflows, dataflow semantics, workflow optimization, scientific workflows, performance analysis},
  abstract     = {The combination of ever-growing scientific datasets and distributed workflow complexity creates I/O performance bottlenecks due to data volume, velocity, and variety.
                  Although the increasing use of descriptive data formats (e.g., HDF5, netCDF) helps organize these datasets, it also introduces obscure bottlenecks due to the need to translate high-level operations into file addresses and then into low-level I/O operations.
                  To address this challenge, we introduce DaYu, a method and toolset for analyzing (a) semantic relationships between logical datasets and file addresses, (b) how dataset operations translate into I/O, and (c) the combination across entire workflows.
                  DaYu's analysis and visualization enable the identification of critical bottlenecks and the reasoning about remediation.
                  We describe our methodology and propose optimization guidelines.
                  Evaluation on scientific workflows demonstrates up to a 3.7x performance improvement in I/O time for obscure bottlenecks.
                  The time and storage overhead for DaYu's time-ordered data are typically under 0.2\% of runtime and 0.25\% of data volume, respectively.},
  price        = {\$31.00},
  copyright    = {{\copyright}2024 IEEE},
}