Abstract
Cloud operators rely on collective communication optimizers to improve the efficiency of the single-tenant, centrally managed training clusters they operate. However, current optimizers struggle to scale to such deployments and often trade solution quality for scalability. Our solution, TE-CCL, adopts a traffic-engineering-based approach to collective communication. Compared to TACCL, a state-of-the-art optimizer, TE-CCL produced schedules with $2\times$ better performance on the topologies TACCL supports, using comparable solver time, and it scales to larger topologies than TACCL does. On our GPU testbed, TE-CCL outperformed TACCL by $2.14\times$ and RCCL by $3.18\times$ in terms of algorithm bandwidth.
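The core idea is to cast collective communication scheduling as a multi-commodity flow problem, as in classical network traffic engineering. As a rough illustration of that framing only, and not TE-CCL's actual formulation (which also models time, buffering, and data copies), the sketch below sets up a fractional multi-commodity flow LP for a hypothetical fully connected 3-GPU topology and minimizes the maximum link utilization with SciPy. All names, capacities, and demands here are illustrative assumptions.

# A toy sketch, NOT TE-CCL's formulation: a fractional multi-commodity
# flow LP in the traffic-engineering style, solved with scipy.linprog.
import numpy as np
from scipy.optimize import linprog

nodes = [0, 1, 2]                                          # hypothetical 3-GPU topology
edges = [(0, 1), (1, 2), (2, 0), (1, 0), (2, 1), (0, 2)]   # directed links a -> b
cap = {e: 1.0 for e in edges}                              # assumed unit capacities

# One commodity per (source, destination) pair of the collective; here node 0
# sends one unit of data to each other node (a broadcast-like demand).
commodities = [(0, 1, 1.0), (0, 2, 1.0)]                   # (src, dst, demand)

n_flow = len(commodities) * len(edges)                     # flow variables f[k][e]
c = np.zeros(n_flow + 1)
c[-1] = 1.0                                                # minimize u = max link utilization

def fvar(k, ei):
    """Column index of the flow variable for commodity k on edge index ei."""
    return k * len(edges) + ei

# Flow conservation: for each commodity and node, outflow minus inflow equals
# +demand at the source, -demand at the destination, and 0 elsewhere.
A_eq, b_eq = [], []
for k, (s, t, d) in enumerate(commodities):
    for v in nodes:
        row = np.zeros(n_flow + 1)
        for ei, (a, b) in enumerate(edges):                # edge a -> b
            if a == v:
                row[fvar(k, ei)] += 1.0
            if b == v:
                row[fvar(k, ei)] -= 1.0
        A_eq.append(row)
        b_eq.append(d if v == s else -d if v == t else 0.0)

# Capacity coupling: total flow on each edge is at most u times its capacity.
A_ub, b_ub = [], []
for ei, e in enumerate(edges):
    row = np.zeros(n_flow + 1)
    for k in range(len(commodities)):
        row[fvar(k, ei)] = 1.0
    row[-1] = -cap[e]
    A_ub.append(row)
    b_ub.append(0.0)

res = linprog(c, A_ub=A_ub, b_ub=b_ub, A_eq=A_eq, b_eq=b_eq,
              bounds=[(0, None)] * (n_flow + 1), method="highs")
print("max link utilization:", res.x[-1])                  # a crude proxy for finish time

Minimizing the maximum link utilization is a standard traffic-engineering objective, used here only to make the toy LP concrete; TE-CCL's objective and constraints differ.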
BibTeX Citation
@inproceedings{10.1145/3651890.3672249,
  author    = {Liu, Xuting and Arzani, Behnaz and Kakarla, Siva Kesava Reddy and Zhao, Liangyu and Liu, Vincent and Castro, Miguel and Kandula, Srikanth and Marshall, Luke},
  title     = {Rethinking Machine Learning Collective Communication as a Multi-Commodity Flow Problem},
  booktitle = {Proceedings of the ACM SIGCOMM 2024 Conference},
  year      = {2024},
  isbn      = {9798400706141},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  url       = {https://doi.org/10.1145/3651890.3672249},
  doi       = {10.1145/3651890.3672249},
  pages     = {16--37},
  numpages  = {22},
  keywords  = {GPU, collective communication, traffic engineering},
  location  = {Sydney, NSW, Australia},
  series    = {ACM SIGCOMM '24}
}