// M_ lives in constant memory. A __constant__ variable is a distinct object on
// each device, so every visible device holds its own copy; the host has to
// initialise each copy separately (select the device, then cudaMemcpyToSymbol).
__constant__ float M_;
// Debug kernel: every launched thread prints the value of the constant-memory
// symbol M_ as seen by the device the kernel runs on. Takes no arguments and
// writes no memory; intended to be launched with a single thread (<<<1,1>>>).
__global__ void showMKernel() {
    printf("****** M_ = %f\n", M_);
}
// Returns true when `status` is cudaSuccess; otherwise prints the name of the
// failing call together with the runtime's error string and returns false.
// printf is used for reporting because it is the only output routine this
// file already relies on.
static bool cudaCallSucceeded(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    printf("CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Copies the host value M into the constant-memory symbol M_ on every visible
// device, then launches showMKernel on each device to print the stored value.
// Returns 0 on success and 1 as soon as any CUDA runtime call fails.
int main() {
    float M = 2.0f;

    int deviceCount = -1;
    if (!cudaCallSucceeded(cudaGetDeviceCount(&deviceCount), "cudaGetDeviceCount")) {
        return 1;
    }

    // Set M_ on the constant memory of each device:
    for (int i = 0; i < deviceCount; i++) {
        if (!cudaCallSucceeded(cudaSetDevice(i), "cudaSetDevice")) {
            return 1;
        }
        if (!cudaCallSucceeded(
                cudaMemcpyToSymbol(M_, &M, sizeof(float), 0, cudaMemcpyDefault),
                "cudaMemcpyToSymbol")) {
            return 1;
        }
    }

    // Now, run a kernel to show M_:
    for (int i = 0; i < deviceCount; i++) {
        if (!cudaCallSucceeded(cudaSetDevice(i), "cudaSetDevice")) {
            return 1;
        }
        // %d, not %g: i is an int, and passing an int to %g is undefined behaviour.
        printf("Device %d :\n", i);
        showMKernel<<<1, 1>>>();
        // A kernel launch returns no status itself; launch errors are read here.
        if (!cudaCallSucceeded(cudaGetLastError(), "showMKernel launch")) {
            return 1;
        }
        // Wait for the kernel so its output appears before the next device's header.
        if (!cudaCallSucceeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) {
            return 1;
        }
    }

    return 0;
}
它返回:
Device 0 :
****** M_ = 2.000000
Device 1 :
****** M_ = 2.000000
// so on for other devices
现在,如果我把
// Set M_ on the constant memory of each device:
for (int i = 0; i < deviceCount; i++) {
cudaSetDevice(i);