Alsvinn  0.5.3
The fast FVM simulator with UQ support
set_cuda_device.hpp
Go to the documentation of this file.
1 /* Copyright (c) 2018 ETH Zurich, Kjetil Olsen Lye
2  * This program is free software: you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation, either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * This program is distributed in the hope that it will be useful,
8  * but WITHOUT ANY WARRANTY; without even the implied warranty of
9  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10  * GNU General Public License for more details.
11  *
12  * You should have received a copy of the GNU General Public License
13  * along with this program. If not, see <http://www.gnu.org/licenses/>.
14  */
15 
16 #pragma once
17 #include "alsutils/config.hpp"
18 #ifdef ALSVINN_HAVE_CUDA
19 #define ENV_LOCAL_RANK "OMPI_COMM_WORLD_LOCAL_RANK"
20 #include <cuda_runtime.h>
21 #include "alsutils/log.hpp"
23 #include <iostream>
24 #include <cstdlib>
25 
26 namespace alsutils {
27 namespace mpi {
34 inline void setCudaDevice() {
35  int deviceCount = -1;
36 
37  try {
38  CUDA_SAFE_CALL_SILENT(cudaGetDeviceCount(&deviceCount));
39  ALSVINN_LOG(INFO, "Number of GPUs on node: " << deviceCount);
40 
41  if (deviceCount > 1) {
42 
43 
44  // We need to do this before we call MPI_Init
45  // (I think at least, hint about this found in
46  // https://devtalk.nvidia.com/default/topic/752046/teaching-and-curriculum-support/multi-gpu-system-running-mpi-cuda-/
47  // )
48  //
49  // Testing revealed this to "probably" be necessary. (2018-05-03 on the Leonhard Cluster at ETHZ)
50  const char* rankAsString = std::getenv(ENV_LOCAL_RANK);
51 
52  if (rankAsString) {
53 
54  int mpiRank = std::atoi(rankAsString);
55 
56  // Reset the state completely (This may not be neccessary)
57  CUDA_SAFE_CALL_SILENT(cudaDeviceReset());
58  CUDA_SAFE_CALL_SILENT(cudaThreadExit());
59 
60  // Round robin kind of allocation (here we assume the mpi nodes gets assigned
61  // rank in this way)
62  const int device = mpiRank % deviceCount;
63  ALSVINN_LOG(INFO, "Setting CUDA device to " << device);
64  CUDA_SAFE_CALL_SILENT(cudaSetDevice(device));
65 
66  // Make sure it worked
67  int currentDevice = -1;
68  CUDA_SAFE_CALL_SILENT(cudaGetDevice(&currentDevice));
69  ALSVINN_LOG(INFO, "Current CUDA device is " << currentDevice);
70 
71  if (currentDevice != device) {
72  ALSVINN_LOG(WARNING, "Could not update current device (" << device << ", " <<
73  currentDevice << ")");
74  }
75  }
76  }
77  } catch (std::runtime_error& e) {
79  "(Ignore this if you are not running with CUDA). Failed setting CUDA GPU: " <<
80  e.what());
81  }
82 }
83 }
84 }
85 #else
86 
namespace alsutils {
namespace mpi {

/// No-op stand-in for builds without CUDA support (ALSVINN_HAVE_CUDA not
/// defined), so callers can invoke setCudaDevice() unconditionally.
inline void setCudaDevice() {}

} // namespace mpi
} // namespace alsutils
95 #endif
void setCudaDevice()
Definition: set_cuda_device.hpp:89
#define INFO
Definition: log.hpp:28
#define CUDA_SAFE_CALL_SILENT(x)
Does the same as CUDA_SAFE_CALL, but doesn't print an error message.
Definition: cuda_safe_call.hpp:31
Various utilities for mpi and cuda.
Definition: Factory.hpp:3
#define WARNING
Definition: log.hpp:30
#define ALSVINN_LOG(severity, message)
Definition: log.hpp:36