// MP1: vector addition exercise -- a vecAdd kernel followed by the tail of a host
// routine that launches it, copies the result back, and releases memory.
//
// NOTE(review): this file is damaged as stored and cannot compile as-is.
//   * Everything sits on one physical line that begins with `//`, so a
//     compiler treats the whole file as a single comment.
//   * Text between `<` and `>` appears to have been stripped: `include` has no
//     header name, the kernel stops at `if (i`, and `vecAdd<<>>` carries no
//     launch configuration. The rest of the kernel body and the beginning of
//     the host routine are not present. Restore from the original source
//     rather than reconstructing by hand.
//
// What the surviving text shows:
//   * vecAdd(in1, in2, out, len): each thread derives a flat index
//     i = threadIdx.x + blockDim.x * blockIdx.x; the statement that starts with
//     `if (i` is truncated, so how `len` is used is not visible here.
//   * Host tail: launches vecAdd, calls cudaThreadSynchronize(), copies
//     deviceOutput into hostOutput with cudaMemcpyDeviceToHost, frees
//     deviceInput1/deviceInput2/deviceOutput with cudaFree, passes hostOutput
//     and inputLength to wbSolution, frees the three host buffers, returns 0.
//
// NOTE(review): `n` is passed both as vecAdd's `len` argument and as the size
//   argument of cudaMemcpy, which is a byte count. The definition of `n` is not
//   visible; if the buffers hold floats (as vecAdd's float* parameters
//   suggest), an element count and a byte count differ, so one of the two uses
//   is suspect -- confirm against the restored source.
// NOTE(review): cudaThreadSynchronize() is deprecated in the CUDA runtime;
//   cudaDeviceSynchronize() is the documented replacement.
// NOTE(review): none of the visible CUDA calls check their return codes, and no
//   cudaGetLastError() follows the launch in the surviving text.
// MP1 include __global__ void vecAdd(float * in1, float * in2, float * out, int len) { //@@ Insert code to implement vector addition here int i = threadIdx.x + blockDim.x * blockIdx.x; if (i>>(deviceInput1, deviceInput2, deviceOutput, inputLength); vecAdd<<>>(deviceInput1, deviceInput2, deviceOutput, n); cudaThreadSynchronize(); wbTime_stop(Compute, "Performing CUDA computation"); wbTime_start(Copy, "Copying output memory to the CPU"); //@@ Copy the GPU memory back to the CPU here //cudaMemcpy(hostOutput, deviceOutput, inputLength, cudaMemcpyDeviceToHost); cudaMemcpy(hostOutput, deviceOutput, n, cudaMemcpyDeviceToHost); wbTime_stop(Copy, "Copying output memory to the CPU"); wbTime_start(GPU, "Freeing GPU Memory"); //@@ Free the GPU memory here cudaFree(deviceInput1); cudaFree(deviceInput2); cudaFree(deviceOutput); wbTime_stop(GPU, "Freeing GPU Memory"); wbSolution(args, hostOutput, inputLength); free(hostInput1); free(hostInput2); free(hostOutput); return 0; }