Abstract
We discuss the potential improvements large language models (LLMs) can provide in incident management and how they can overhaul the ways operators conduct incident management today. We propose a holistic framework for building an AI helper for incident management and discuss several avenues of future research needed to achieve it.
We thoroughly analyze the fundamental requirements the community should consider when designing such helpers. Our work is based on discussions with operators of a large public cloud provider and their prior experiences both in incident management and with attempts to improve the incident management experience through various forms of automation.
BibTeX Citation
@comment{Workshop paper in the HotNets '23 proceedings. The page range is written with the ASCII double hyphen, and acronyms are brace-protected so that styles applying sentence casing keep them uppercase.}
@inproceedings{10.1145/3626111.3628176,
  author    = {Hamadanian, Pouya and
               Arzani, Behnaz and
               Fouladi, Sadjad and
               Kakarla, Siva Kesava Reddy and
               Fonseca, Rodrigo and
               Billor, Denizcan and
               Cheema, Ahmad and
               Nkposong, Edet and
               Chandra, Ranveer},
  title     = {A Holistic View of {AI}-Driven Network Incident Management},
  year      = {2023},
  isbn      = {9798400704154},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  url       = {https://doi.org/10.1145/3626111.3628176},
  doi       = {10.1145/3626111.3628176},
  booktitle = {Proceedings of the 22nd {ACM} Workshop on Hot Topics in Networks},
  pages     = {180--188},
  numpages  = {9},
  keywords  = {Incident Management, Large Language Models},
  location  = {Cambridge, MA, USA},
  series    = {HotNets '23}
}