+-----------------------------------------------------------------------------------------+
| Processes:                                                                               |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory  |
|        ID   ID                                                               Usage       |
|=========================================================================================|
|  No running processes found                                                              |
+-----------------------------------------------------------------------------------------+
Next comes setting up the CUDA environment. In theory the install script provided by the course (for Ubuntu 22.04) should be enough, but it did nothing for me. I then reinstalled following the official WSL instructions (CUDA Toolkit 12.4) and it still didn't work. Some digging suggested the environment variables simply hadn't been set: I use zsh while the course install script assumes bash, so the variables have to be added by hand. Run nvcc -V to check whether the CUDA compiler is picked up:
~ » nvcc -V                                                          mizukicry@S-Terminal
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Mar_28_02:18:24_PDT_2024
Cuda compilation tools, release 12.4, V12.4.131
Build cuda_12.4.r12.4/compiler.34097967_0
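For reference, the manual step amounts to roughly the following in ~/.zshrc (a sketch; the paths assume a default CUDA 12.4 install under /usr/local/cuda-12.4, so adjust them to wherever your toolkit actually lives):

# added by hand for zsh; the course script only edits the bash profile
export PATH=/usr/local/cuda-12.4/bin${PATH:+:${PATH}}
export LD_LIBRARY_PATH=/usr/local/cuda-12.4/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}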
// saxpyCuda --
//
// This function is regular C code running on the CPU. It allocates
// memory on the GPU using CUDA API functions, uses CUDA API functions
// to transfer data from the CPU's memory address space to GPU memory
// address space, and launches the CUDA kernel function on the GPU.
void saxpyCuda(int N, float alpha, float *xarray, float *yarray, float *resultarray) {

    // must read both input arrays (xarray and yarray) and write to
    // output array (resultarray)
    int totalBytes = sizeof(float) * 3 * N;

    // compute number of blocks and threads per block. In this
    // application we've hardcoded thread blocks to contain 512 CUDA
    // threads.
    const int threadsPerBlock = 512;

    // Notice the round up here. The code needs to compute the number
    // of thread blocks needed such that there is one thread per
    // element of the arrays. This code is written to work for values
    // of N that are not multiples of threadsPerBlock.
    const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;

    // These are pointers that will be pointers to memory allocated
    // *on the GPU*. You should allocate these pointers via
    // cudaMalloc. You can access the resulting buffers from CUDA
    // device kernel code (see the kernel function saxpy_kernel()
    // above) but you cannot access the contents of these buffers from
    // this thread. CPU threads cannot issue loads and stores from GPU
    // memory!
    float *device_x = nullptr;
    float *device_y = nullptr;
    float *device_result = nullptr;

    //
    // CS149 TODO: allocate device memory buffers on the GPU using cudaMalloc.
    //
    // We highly recommend taking a look at NVIDIA's
    // tutorial, which clearly walks you through the few lines of code
    // you need to write for this part of the assignment:
    //
    // https://devblogs.nvidia.com/easy-introduction-cuda-c-and-c/
    //
    cudaMalloc(&device_x, sizeof(float) * N);
    cudaMalloc(&device_y, sizeof(float) * N);
    cudaMalloc(&device_result, sizeof(float) * N);

    // start timing after allocation of device memory
    double startTime = CycleTimer::currentSeconds();

    //
    // CS149 TODO: copy input arrays to the GPU using cudaMemcpy
    //
    cudaMemcpy(device_x, xarray, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(device_y, yarray, sizeof(float) * N, cudaMemcpyHostToDevice);

    // run CUDA kernel. (notice the <<< >>> brackets indicating a CUDA
    // kernel launch) Execution on the GPU occurs here.
    saxpy_kernel<<<blocks, threadsPerBlock>>>(N, alpha, device_x, device_y, device_result);

    //
    // CS149 TODO: copy result from GPU back to CPU using cudaMemcpy
    //
    cudaMemcpy(resultarray, device_result, sizeof(float) * N, cudaMemcpyDeviceToHost);

    // end timing after result has been copied back into host memory
    double endTime = CycleTimer::currentSeconds();

    cudaError_t errCode = cudaPeekAtLastError();
    if (errCode != cudaSuccess) {
        fprintf(stderr, "WARNING: A CUDA error occurred: code=%d, %s\n",
                errCode, cudaGetErrorString(errCode));
    }

    double overallDuration = endTime - startTime;
    printf("Effective BW by CUDA saxpy: %.3f ms\t\t[%.3f GB/s]\n",
           1000.f * overallDuration, GBPerSec(totalBytes, overallDuration));

    //
    // CS149 TODO: free memory buffers on the GPU using cudaFree
    //
    cudaFree(device_x);
    cudaFree(device_y);
    cudaFree(device_result);
}
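The device kernel referenced by the launch above isn't shown in the snippet; in the starter code it is essentially a one-line per-element function. A sketch of what it looks like (names taken from the launch above):

__global__ void saxpy_kernel(int N, float alpha, float *x, float *y, float *result) {
    // each CUDA thread computes one output element
    int index = blockIdx.x * blockDim.x + threadIdx.x;

    // guard needed because the host rounds the launched thread count
    // up to a multiple of threadsPerBlock
    if (index < N)
        result[index] = alpha * x[index] + y[index];
}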
Part 2: CUDA Warm-Up 2: Parallel Prefix-Sum (10 pts)
The first task is to implement the parallel exclusive prefix sum algorithm from the lecture slides (the exclusive_scan function in scan/scan.cu). Honestly, the slides never really explain this algorithm. It looks like the same idea as building a binary indexed tree: an upsweep that accumulates partial sums in place, followed by a downsweep that pushes the prefixes back down the tree. I didn't fully work it out, so I just translated the pseudocode given in the assignment handout.
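For reference, the behaviour the kernels have to reproduce is simply the serial exclusive scan below (my own sketch, not part of the starter code):

// serial exclusive prefix sum: output[0] = 0, output[i] = input[0] + ... + input[i-1]
void exclusive_scan_serial(int *input, int N, int *output) {
    int sum = 0;
    for (int i = 0; i < N; i++) {
        output[i] = sum;
        sum += input[i];
    }
}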
__global__ void upsweep(int *result, int numThreads, int stride) {
    int threadId = blockIdx.x * blockDim.x + threadIdx.x;
    if (threadId < numThreads) {
        // each active thread adds the sum of the left half of its
        // segment into the segment's last element
        int index = (threadId + 1) * stride - 1;
        result[index] += result[index - (stride >> 1)];
    }
}

__global__ void downsweep(int *result, int numThreads, int stride) {
    int threadId = blockIdx.x * blockDim.x + threadIdx.x;
    if (threadId < numThreads) {
        // swap the left child's partial sum into place and push the
        // running prefix down to the right child
        int index = (threadId + 1) * stride - 1;
        int t = result[index - (stride >> 1)];
        result[index - (stride >> 1)] = result[index];
        result[index] += t;
    }
}
// exclusive_scan --
//
// Implementation of an exclusive scan on global memory array `input`,
// with results placed in global memory `result`.
//
// N is the logical size of the input and output arrays, however
// students can assume that both the start and result arrays were
// allocated with next power-of-two sizes as described by the comments
// in cudaScan(). This is helpful, since your parallel scan
// will likely write to memory locations beyond N, but of course not
// greater than N rounded up to the next power of 2.
//
// Also, as per the comments in cudaScan(), you can implement an
// "in-place" scan, since the timing harness makes a copy of input and
// places it in result
void exclusive_scan(int *input, int N, int *result) {
    N = nextPow2(N);

    // upsweep phase
    for (int stride = 2; stride <= N; stride <<= 1) {
        int numThreads = N / stride;
        int numBlocks = (numThreads + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
        upsweep<<<numBlocks, THREADS_PER_BLOCK>>>(result, numThreads, stride);
        cudaDeviceSynchronize();
    }

    // reset the last element to 0
    cudaMemset(&result[N - 1], 0, sizeof(int));
    cudaDeviceSynchronize();

    // downsweep phase
    for (int stride = N; stride >= 2; stride >>= 1) {
        int numThreads = N / stride;
        int numBlocks = (numThreads + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
        downsweep<<<numBlocks, THREADS_PER_BLOCK>>>(result, numThreads, stride);
        cudaDeviceSynchronize();
    }
}
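A quick hand trace on a tiny input helped me convince myself the two phases are right (my own example, N = 4, scanning [1, 2, 3, 4]):

// start:                  [1, 2, 3, 4]
// upsweep, stride 2:      [1, 3, 3, 7]    (pairwise sums land in the odd slots)
// upsweep, stride 4:      [1, 3, 3, 10]   (the grand total lands in the last slot)
// zero the last element:  [1, 3, 3, 0]
// downsweep, stride 4:    [1, 0, 3, 3]
// downsweep, stride 2:    [0, 1, 3, 6]    = exclusive prefix sums of the input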
~/codes/CS149/asst3/scan (master*) » ./cudaScan -n 4000000           mizukicry@S-Terminal
---------------------------------------------------------
Found 1 CUDA devices
Device 0: NVIDIA GeForce RTX 4050 Laptop GPU
   SMs:        20
   Global mem: 6140 MB
   CUDA Cap:   8.9
---------------------------------------------------------
Array size: 4000000
Student GPU time: 2.802 ms
Scan outputs are correct!
At first, inside the device kernels I computed index and used it for the out-of-bounds check. During testing, any array larger than 4194304 (2^22) elements would fail. I initially suspected a stack overflow, but lifting the stack limit changed nothing. Tracking it down, the real cause was that for threads whose threadId goes far past the valid range, the index computation overflows int. That one cost me a lot of debugging time.
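Roughly, the difference between the broken and the fixed bounds check looks like this (the "broken" variant is reconstructed from memory, not my exact code):

// Broken variant: every launched thread computes `index` before any check,
// including threads past numThreads (there can be many of them, depending on
// how the launch is sized). At the later levels `stride` is huge, so
// (threadId + 1) * stride exceeds INT_MAX for those extra threads, `index`
// wraps negative, passes the `index < N` test, and the access goes out of bounds.
__global__ void upsweep_broken(int *result, int N, int stride) {
    int threadId = blockIdx.x * blockDim.x + threadIdx.x;
    int index = (threadId + 1) * stride - 1;   // may overflow int
    if (index < N) {                           // check happens too late
        result[index] += result[index - (stride >> 1)];
    }
}
// The fix is simply to bound threadId first (as in the upsweep/downsweep
// kernels above), so the multiplication never runs for out-of-range threads.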
__global__ void fr_part1(int *input, int *a, int length) {
    // flag positions where an element equals its right neighbour
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    a[index] = (index < length - 1 && input[index] == input[index + 1]) ? 1 : 0;
}

__global__ void fr_part2(int *a, int length, int *output) {
    // after `a` has been exclusive-scanned, a[index] != a[index + 1] means the
    // flag at `index` was 1, and a[index] is its slot in the output array
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < length - 1 && a[index] != a[index + 1]) {
        output[a[index]] = index;
    }
}
// find_repeats --
//
// Given an array of integers `device_input`, returns an array of all
// indices `i` for which `device_input[i] == device_input[i+1]`.
//
// Returns the total number of pairs found
int find_repeats(int *device_input, int length, int *device_output) {
    int N = nextPow2(length);