@comment{Conference paper (IEEE CLOUD 2023) proposing Xion, a scheduling framework for distributed deep learning jobs that accounts for preemption overheads. Author and editor names are stored in "Last, First" form; the citation key is the exporter-generated identifier and is kept unchanged so existing citations still resolve.}
@inproceedings{f0c382ff18f141af88f4ff282a4c9d84,
title = "Selective Preemption of Distributed Deep Learning Training",
abstract = "As more distributed deep learning (DDL) jobs run in public clouds, their effective scheduling becomes a major challenge. Current studies prioritize the execution of jobs with less remaining time, which is known to be the best in reducing average job completion time (JCT). However, we observe that this approach does not work when the preemption for pausing and loading jobs weighs in; sometimes, the preemption overheads of DDL jobs take up to hundreds of seconds. This results in very ineffective scheduling, so in some cases, the first-in-first-out policy performs much better. This paper proposes a new scheduling framework called Xion that takes into account the preemption overheads and only preempts DDL jobs when it is beneficial. Our evaluation results demonstrate that Xion effectively reduces the average JCT by 19\% and improves the waiting time by 1.64$\times$.",
keywords = "Distributed deep learning, GPU cloud, GPU scheduling, Job scheduling, Preemption, SRTF",
author = "Go, Younghun and Shin, Changyong and Lee, Jeunghwan and Yoo, Yeonho and Yang, Gyeongsik and Yoo, Chuck",
note = "Publisher Copyright: {\textcopyright} 2023 IEEE.; 16th IEEE International Conference on Cloud Computing, CLOUD 2023 ; Conference date: 02-07-2023 Through 08-07-2023",
year = "2023",
doi = "10.1109/CLOUD60044.2023.00028",
language = "English",
series = "IEEE International Conference on Cloud Computing, CLOUD",
publisher = "IEEE Computer Society",
pages = "175--177",
editor = "Ardagna, Claudio and Atukorala, Nimanthi and Beckman, Pete and Chang, Carl K. and Chang, Rong N. and Evangelinos, Constantinos and Fan, Jing and Fox, Geoffrey C. and Fox, Judy and Hagleitner, Christoph and Jin, Zhi and Kosar, Tevfik and Parashar, Manish",
booktitle = "Proceedings - 2023 IEEE 16th International Conference on Cloud Computing, CLOUD 2023",
address = "United States",
}