2015
Sewell, Christopher; Lo, Li-Ta; Heitmann, Katrin; Habib, Salman; Ahrens, James
Utilizing Many-Core Accelerators for Halo and Center Finding within a Cosmology Simulation Proceedings Article
In: Proceedings of the IEEE Symposium on Large Data Analysis and Visualization, IEEE Press, Chicago, Illinois, 2015, (LA-UR-15-22202).
Abstract | Links | BibTeX | Tags: cosmology, halo finding, many-core, Programming Techniques
@inproceedings{Sewell:2015a,
title = {Utilizing Many-Core Accelerators for Halo and Center Finding within a Cosmology Simulation},
author = {Christopher Sewell and Li-Ta Lo and Katrin Heitmann and Salman Habib and James Ahrens},
url = {http://datascience.dsscale.org/wp-content/uploads/2016/06/UtilizingMany-CoreAcceleratorsForHaloAndCenterFindingWithinACosmologySimulation.pdf},
year = {2015},
date = {2015-01-01},
booktitle = {Proceedings of the IEEE Symposium on Large Data Analysis and Visualization},
publisher = {IEEE Press},
address = {Chicago, Illinois},
series = {LDAV '15},
abstract = {Efficiently finding and computing statistics about “halos” (regions of high density) are essential analysis steps for N-body cosmology simulations. However, in state-of-the-art simulation codes, these analysis operators do not currently take advantage of the shared- memory data-parallelism available on multi-core and many-core ar- chitectures. The Hybrid / Hardware Accelerated Cosmology Code (HACC) is designed as an MPI+X code, but the analysis operators are parallelized only among MPI ranks, because of the difficulty in porting different X implementations (e.g., OpenMP, CUDA) across all architectures on which it is run. In this paper, we present portable data-parallel algorithms for several variations of halo find- ing and halo center finding algorithms. These are implemented with the PISTON component of the VTK-m framework, which uses Nvidia’s Thrust library to construct data-parallel algorithms that al- low a single implementation to be compiled to multiple backends to target a variety of multi-core and many-core architectures. Fi- nally, we compare the performance of our halo and center find- ing algorithms against the original HACC implementations on the Moonlight, Stampede, and Titan supercomputers. The portability of Thrust allowed the same code to run efficiently on each of these architectures. On Titan, the performance improvements using our code have enabled halo analysis to be performed on a very large data set (81923 particles across 16,384 nodes of Titan) for which analysis using only the existing CPU algorithms was not feasible.},
note = {LA-UR-15-22202},
keywords = {cosmology, halo finding, many-core, Programming Techniques},
pubstate = {published},
tppubtype = {inproceedings}
}
2014
Sewell, Christopher; Heitmann, Katrin; Lo, Li-Ta; Habib, Salman; Ahrens, James
Portable Parallel Halo and Center Finders for HACC Presentation
31.07.2014, (LA-UR-14-25437).
Abstract | Links | BibTeX | Tags: halo finding, PISTON, VTK-m
@misc{Sewell2014b,
title = {Portable Parallel Halo and Center Finders for HACC},
author = {Christopher Sewell and Katrin Heitmann and Li-Ta Lo and Salman Habib and James Ahrens},
url = {http://datascience.dsscale.org/wp-content/uploads/2016/08/Portable_Parallel_Halo_and_Center_Finders_for_HACC.pdf},
year = {2014},
date = {2014-07-31},
abstract = {This presentation describes our work on finding halos and halo centers for the HACC cosmology code using our portable, data-parallel framework, which allows us to run on accelerators such as GPUs, providing significant speed-up. This work, which is part of the SDAV VTK-m project, enabled halo analysis to be performed on a very large data set (8192^3 particles across 16,384 nodes on Titan) for which analysis using the traditional CPU algorithms was not feasible.},
note = {LA-UR-14-25437},
keywords = {halo finding, PISTON, VTK-m},
pubstate = {published},
tppubtype = {presentation}
}
Sewell, Christopher; Lo, Li-Ta; Heitmann, Katrin; Habib, Salman; Ahrens, James
Utilizing Many-Core Accelerators for Halo and Center Finding within a Cosmology Simulation Proceedings Article
In: Proceedings of the IEEE Symposium on Large Data Analysis and Visualization, IEEE Press, Chicago, Illinois, 2015, (LA-UR-15-22202).
@inproceedings{Sewell:2015a,
title = {Utilizing Many-Core Accelerators for Halo and Center Finding within a Cosmology Simulation},
author = {Christopher Sewell and Li-Ta Lo and Katrin Heitmann and Salman Habib and James Ahrens},
url = {http://datascience.dsscale.org/wp-content/uploads/2016/06/UtilizingMany-CoreAcceleratorsForHaloAndCenterFindingWithinACosmologySimulation.pdf},
year = {2015},
date = {2015-01-01},
booktitle = {Proceedings of the IEEE Symposium on Large Data Analysis and Visualization},
publisher = {IEEE Press},
address = {Chicago, Illinois},
series = {LDAV '15},
abstract = {Efficiently finding and computing statistics about “halos” (regions of high density) are essential analysis steps for N-body cosmology simulations. However, in state-of-the-art simulation codes, these analysis operators do not currently take advantage of the shared- memory data-parallelism available on multi-core and many-core ar- chitectures. The Hybrid / Hardware Accelerated Cosmology Code (HACC) is designed as an MPI+X code, but the analysis operators are parallelized only among MPI ranks, because of the difficulty in porting different X implementations (e.g., OpenMP, CUDA) across all architectures on which it is run. In this paper, we present portable data-parallel algorithms for several variations of halo find- ing and halo center finding algorithms. These are implemented with the PISTON component of the VTK-m framework, which uses Nvidia’s Thrust library to construct data-parallel algorithms that al- low a single implementation to be compiled to multiple backends to target a variety of multi-core and many-core architectures. Fi- nally, we compare the performance of our halo and center find- ing algorithms against the original HACC implementations on the Moonlight, Stampede, and Titan supercomputers. The portability of Thrust allowed the same code to run efficiently on each of these architectures. On Titan, the performance improvements using our code have enabled halo analysis to be performed on a very large data set (81923 particles across 16,384 nodes of Titan) for which analysis using only the existing CPU algorithms was not feasible.},
note = {LA-UR-15-22202},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Sewell, Christopher; Heitmann, Katrin; Lo, Li-Ta; Habib, Salman; Ahrens, James
Portable Parallel Halo and Center Finders for HACC Presentation
31.07.2014, (LA-UR-14-25437).
@misc{Sewell2014b,
title = {Portable Parallel Halo and Center Finders for HACC},
author = {Christopher Sewell and Katrin Heitmann and Li-Ta Lo and Salman Habib and James Ahrens},
url = {http://datascience.dsscale.org/wp-content/uploads/2016/08/Portable_Parallel_Halo_and_Center_Finders_for_HACC.pdf},
year = {2014},
date = {2014-07-31},
abstract = {This presentation describes our work on finding halos and halo centers for the HACC cosmology code using our portable, data-parallel framework, which allows us to run on accelerators such as GPUs, providing significant speed-up. This work, which is part of the SDAV VTK-m project, enabled halo analysis to be performed on a very large data set (8192^3 particles across 16,384 nodes on Titan) for which analysis using the traditional CPU algorithms was not feasible.},
note = {LA-UR-14-25437},
keywords = {},
pubstate = {published},
tppubtype = {presentation}
}