@INPROCEEDINGS{
Ranjan2025AHB,
booktitle = "Proceedings of the 2025 IEEE 39th International Parallel and Distributed
Processing Symposium Workshops (IPDPSW 2025), June~3--7, 2025, Milan, Italy",
address = "Los Alamitos, CA, USA",
publisher = "IEEE Computer Society",
author = "Ranjan, Desh and Zubair, Mohammad",
title = "A Header-Based {C++} Library for Computing {Hessian} on {GPU} using Automatic
Differentiation",
year = "2025",
month = jun,
pages = "355--364",
keywords = "Automatic Differentiation; Hessian Computation; Hessian-vector product computation;
GPU Computing",
doi = "10.1109/IPDPSW66978.2025.00061",
abstract = "The Hessian-vector product computation appears in many scientific applications such
as in optimization and finite element modeling. Often there is a need for computing Hessian-vector
products at many data points concurrently. We propose an automatic differentiation (AD) based
method, CHESSFAD (Chunked HESSian using Forward-mode AD), that is designed with efficient parallel
computation of Hessian and Hessian-Vector products in mind. CHESSFAD computes second-order
derivatives using forward mode and exposes parallelism at different levels that can be exploited on
accelerators such as NVIDIA GPUs. In CHESSFAD approach, the computation of a row of the Hessian
matrix is independent of the computation of other rows. Hence rows of the Hessian matrix can be
computed concurrently. The second level of parallelism is exposed because CHESSFAD approach
partitions the computation of a Hessian row into chunks, where different chunks can be computed
concurrently. CHESSFAD is implemented as a lightweight header-based C++ library that works both for
CPUs and GPUs. We evaluate the performance of CHESSFAD for performing a large number of independent
Hessian-Vector products on a set of standard test functions, and compare its performance to other
existing header-based C++ libraries such as autodiff. Our results show that CHESSFAD performs better
than autodiff, on all these functions with improvement ranging from 5--50\% on average. We also
analyze its efficiency on GPUs as the number of variables in the function grows. We demonstrate that
our approach is easily parallelizable and enables us to work with Hessian of a function of a large
number of variables, which was not possible in sequential implementation. For example, the
sequential execution time required for the Hessian-vector product for two variables is approximately
enough to compute the Hessian-vector product for 16 variables on GPU for all three functions. A
basic analysis of the number of arithmetic operations needed for computing the Hessian using the
CHESSFAD approach is also provided.",
ad_theotech = "Forward Mode, Hessian"
}