This tutorial's C++ CUDA_CHECK code examples are quite practical; I hope they help you.
This article collects and summarizes typical usage examples of the CUDA_CHECK function in C++. If you are struggling with questions such as: what exactly does the C++ CUDA_CHECK function do? How is it called? What does real-world usage look like? Then congratulations: the hand-picked examples below may be just what you need. A total of 28 CUDA_CHECK code examples are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help our system recommend better C++ code examples.

Example 1: InternalThread::StartInternalThread

void InternalThread::StartInternalThread() {
  // TODO switch to failing once Caffe prefetch thread is persistent.
  // Threads should not be started and stopped repeatedly.
  // CHECK(!is_started());
  StopInternalThread();

#ifndef CPU_ONLY
  CUDA_CHECK(cudaGetDevice(&device_));
#endif

  mode_ = Caffe::mode();
  rand_seed_ = caffe_rng_rand();
  solver_count_ = Caffe::solver_count();
  root_solver_ = Caffe::root_solver();

  try {
    thread_.reset(new boost::thread(&InternalThread::entry, this));
  } catch (std::exception& e) {
    CHECK(false) << e.what();
  }
}

Developer ID: XinLiuNvidia, Project: caffe, Lines of code: 20
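All of the Caffe-derived examples in this article assume the same helper. For reference, here is a minimal sketch of how CUDA_CHECK is commonly defined for the CUDA runtime API; it follows the definition in Caffe's device_alternate.hpp, which relies on glog's CHECK_EQ, but treat it as illustrative rather than authoritative for any other project:

// Wrap a CUDA runtime call: compare its return code against cudaSuccess
// and abort with a human-readable error string on failure.
#define CUDA_CHECK(condition) \
  do { \
    cudaError_t error = (condition); \
    CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
  } while (0)

The do { ... } while (0) wrapper makes the macro behave like a single statement, so it composes safely with if/else bodies that omit braces.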
Example 2: transform

void transform(Param<T> out, CParam<T> in, CParam<float> tf, const bool inverse)
{
    const dim_type nimages = in.dims[2];
    // Multiplied in src/backend/transform.cpp
    const dim_type ntransforms = out.dims[2] / in.dims[2];

    // Copy transform to constant memory.
    CUDA_CHECK(cudaMemcpyToSymbol(c_tmat, tf.ptr,
                                  ntransforms * 6 * sizeof(float), 0,
                                  cudaMemcpyDeviceToDevice));

    dim3 threads(TX, TY, 1);
    dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y));

    if (nimages > 1)     { blocks.x *= nimages; }
    if (ntransforms > 1) { blocks.y *= ntransforms; }

    if (inverse) {
        transform_kernel<T, true><<<blocks, threads>>>(out, in, nimages, ntransforms);
    } else {
        // ... (listing truncated here in the original)

Developer ID: EasonYi, Project: arrayfire, Lines of code: 20
Example 3: Timer::MilliSeconds

float Timer::MilliSeconds() {
  if (!has_run_at_least_once()) {
    LOG(WARNING) << "Timer has never been run before reading time.";
    return 0;
  }
  if (running()) {
    Stop();
  }
  if (Caffe::mode() == Caffe::GPU) {
#ifndef CPU_ONLY
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_,
                                    stop_gpu_));
#else
    NO_GPU;
#endif
  } else {
    elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds();
  }
  return elapsed_milliseconds_;
}

Developer ID: azrael417, Project: caffe, Lines of code: 20
Example 4: CuDNNConvolutionLayer::LayerSetUp

void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
  // Initialize CUDA streams and cuDNN.
  stream_ = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  handle_ = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];

  for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
    CUDA_CHECK(cudaStreamCreate(&stream_[g]));
    CUDNN_CHECK(cudnnCreate(&handle_[g]));
    CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
  }

  // Set the indexing parameters.
  weight_offset_ = (this->num_output_ / this->group_)
      * (this->channels_ / this->group_) * this->kernel_h_ * this->kernel_w_;
  bias_offset_ = (this->num_output_ / this->group_);

  // Create filter descriptor.
  cudnn::createFilterDesc<Dtype>(&filter_desc_,
      this->num_output_ / this->group_, this->channels_ / this->group_,
      this->kernel_h_, this->kernel_w_);

  // Create tensor descriptor(s) for data and corresponding convolution(s).
  for (int i = 0; i < bottom.size(); i++) {
    cudnnTensor4dDescriptor_t bottom_desc;
    cudnn::createTensor4dDesc<Dtype>(&bottom_desc);
    bottom_descs_.push_back(bottom_desc);
    cudnnTensor4dDescriptor_t top_desc;
    cudnn::createTensor4dDesc<Dtype>(&top_desc);
    top_descs_.push_back(top_desc);
    cudnnConvolutionDescriptor_t conv_desc;
    cudnn::createConvolutionDesc<Dtype>(&conv_desc);
    conv_descs_.push_back(conv_desc);
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::createTensor4dDesc<Dtype>(&bias_desc_);
  }
}

Developer ID: 13331151, Project: deeplab-public, Lines of code: 41
Example 5: morph

Array<T> morph(const Array<T> &in, const Array<T> &mask) {
    const dim4 mdims = mask.dims();

    if (mdims[0] != mdims[1])
        CUDA_NOT_SUPPORTED("Rectangular masks are not supported");
    if (mdims[0] > 19) CUDA_NOT_SUPPORTED("Kernels > 19x19 are not supported");

    Array<T> out = createEmptyArray<T>(in.dims());

    CUDA_CHECK(cudaMemcpyToSymbolAsync(
        kernel::cFilter, mask.get(), mdims[0] * mdims[1] * sizeof(T), 0,
        cudaMemcpyDeviceToDevice, cuda::getActiveStream()));

    if (isDilation)
        kernel::morph<T, true>(out, in, mdims[0]);
    else
        kernel::morph<T, false>(out, in, mdims[0]);

    return out;
}

Developer ID: 9prady9, Project: arrayfire, Lines of code: 21
Example 6: caffe_copy

void caffe_copy(const int N, const Dtype* X, Dtype* Y) {
  if (X != Y) {
    // If there are more than one openmp thread (we are in active region)
    // then checking Caffe::mode can create additional GPU Context
    //
    if (
#ifdef _OPENMP
        (omp_in_parallel() == 0) &&
#endif
        (Caffe::mode() == Caffe::GPU)) {
#ifndef CPU_ONLY
      // NOLINT_NEXT_LINE(caffe/alt_fn)
      CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault));
#else
      NO_GPU;
#endif
    } else {
      caffe_cpu_copy<Dtype>(N, X, Y);
    }
  }
}

Developer ID: crobertob, Project: caffe, Lines of code: 21
Example 7: SyncedMemory::to_cpu

// Move the data to the CPU.
inline void SyncedMemory::to_cpu() {
  switch (head_) {
  case UNINITIALIZED:
    CaffeMallocHost(&cpu_ptr_, size_);
    memset(cpu_ptr_, 0, size_);
    head_ = HEAD_AT_CPU;
    own_cpu_data_ = true;
    break;
  case HEAD_AT_GPU:
    if (cpu_ptr_ == NULL) {
      CaffeMallocHost(&cpu_ptr_, size_);
      own_cpu_data_ = true;
    }
    CUDA_CHECK(cudaMemcpy(cpu_ptr_, gpu_ptr_, size_, cudaMemcpyDeviceToHost));
    head_ = SYNCED;
    break;
  case HEAD_AT_CPU:
  case SYNCED:
    break;
  }
}

Developer ID: clarencezhang, Project: caffe-windows-multilabels, Lines of code: 22
Example 8: normalizeGPULaunch

/*
// Launch GPU kernel of normalize
//
// API
// int normalizeGPULaunch(const int alfa, CvLSVMFeatureMapGPU *dev_map_in,
//         CvLSVMFeatureMapGPU *dev_norm, CvLSVMFeatureMapGPU *dev_map_out,
//         CUstream stream);
// INPUT
// alfa
// dev_map_in
// dev_norm
// stream
// OUTPUT
// dev_map_out
// RESULT
// Error status
*/
int normalizeGPULaunch(const float alfa, CvLSVMFeatureMapGPU *dev_map_in,
        CvLSVMFeatureMapGPU *dev_norm, CvLSVMFeatureMapGPU *dev_map_out,
        CUstream stream)
{
    int sizeX, sizeY;
    int thread_num_x, thread_num_y, thread_num_z;
    int block_num_x, block_num_y, block_num_z;
    int sharedMemBytes;
    CUresult res;

    sizeX = dev_map_in->sizeX;
    sizeY = dev_map_in->sizeY;

    void *normalize_kernel_arg[] = {
        (void *) &dev_map_in->map,
        (void *) &dev_norm->map,
        (void *) &dev_map_out->map,
        (void *) &sizeX,
        (void *) &sizeY,
        (void *) &alfa,
    };

    thread_num_x = (sizeX < std::sqrt(max_threads_num)) ? sizeX : std::sqrt(max_threads_num);
    thread_num_y = (sizeY < std::sqrt(max_threads_num)) ? sizeY : std::sqrt(max_threads_num);
    thread_num_z = 1;
    block_num_x = sizeX / thread_num_x;
    block_num_y = sizeY / thread_num_y;
    block_num_z = NUM_SECTOR * 2;
    if (sizeX % thread_num_x != 0) block_num_x++;
    if (sizeY % thread_num_y != 0) block_num_y++;

    sharedMemBytes = 0;

    res = cuLaunchKernel(normalizeAndTruncate_func[0], block_num_x, block_num_y,
            block_num_z, thread_num_x, thread_num_y, thread_num_z,
            sharedMemBytes, stream, normalize_kernel_arg, NULL);
    CUDA_CHECK(res, "cuLaunchKernel(normalizeAndTruncate)");

    return LATENT_SVM_OK;
}

Developer ID: ZenzouFuruta, Project: Autoware, Lines of code: 57
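Note that the Autoware examples in this article (Examples 8, 9, 13, 21 through 23, and 25) use a different checker that happens to share the name: a two-argument helper for the CUDA driver API, taking a CUresult plus a label naming the failed call. Autoware's exact definition is not reproduced here; the following is only a sketch of what such a helper could look like, assuming <cuda.h>, <cstdio>, and <cstdlib> are included:

// Driver-API variant: verify a CUresult and report which call failed.
static void CUDA_CHECK(CUresult res, const char *msg)
{
    if (res != CUDA_SUCCESS) {
        const char *name = NULL;
        cuGetErrorName(res, &name);  // e.g. "CUDA_ERROR_INVALID_VALUE"
        fprintf(stderr, "%s failed: %s\n", msg, name ? name : "unknown error");
        exit(EXIT_FAILURE);
    }
}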
Example 9: PCAFeatureMapsAddNullableBorderGPULaunch

/*
// Launch GPU kernel of PCA feature maps
//
// API
// int PCAFeatureMapsAddNullableBorderGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
//         CvLSVMFeatureMapGPU *dev_map_out, const int bx, const int by,
//         CUstream stream);
// INPUT
// dev_map_in
// bx
// by
// stream
// OUTPUT
// dev_map_out
// RESULT
// Error status
*/
int PCAFeatureMapsAddNullableBorderGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
        CvLSVMFeatureMapGPU *dev_map_out, const int bx, const int by,
        CUstream stream)
{
    int sizeX, sizeY, p;
    int thread_num_x, thread_num_y, thread_num_z;
    int block_num_x, block_num_y, block_num_z;
    int sharedMemBytes;
    CUresult res;

    sizeX = dev_map_in->sizeX;
    sizeY = dev_map_in->sizeY;
    p = dev_map_in->numFeatures;

    void *pca_kernel_arg[] = {
        (void *) &dev_map_in->map,
        (void *) &dev_map_out->map,
        (void *) &sizeX,
        (void *) &sizeY,
        (void *) &p,
        (void *) &bx,
        (void *) &by
    };

    thread_num_x = (sizeX < std::sqrt(max_threads_num)) ? sizeX : std::sqrt(max_threads_num);
    thread_num_y = (sizeY < std::sqrt(max_threads_num)) ? sizeY : std::sqrt(max_threads_num);
    thread_num_z = 1;
    block_num_x = sizeX / thread_num_x;
    block_num_y = sizeY / thread_num_y;
    block_num_z = 1;
    if (sizeX % thread_num_x != 0) block_num_x++;
    if (sizeY % thread_num_y != 0) block_num_y++;

    sharedMemBytes = 0;

    res = cuLaunchKernel(PCAFeatureMapsAddNullableBorder_func[0], block_num_x,
            block_num_y, block_num_z, thread_num_x, thread_num_y, thread_num_z,
            sharedMemBytes, stream, pca_kernel_arg, NULL);
    CUDA_CHECK(res, "cuLaunchKernel(PCAFeatureMaps)");

    return LATENT_SVM_OK;
}

Developer ID: ZenzouFuruta, Project: Autoware, Lines of code: 57
Example 10: SyncedMemory::to_cpu

inline void SyncedMemory::to_cpu() {
  switch (head_) {
  case UNINITIALIZED:
    CaffeMallocHost(&cpu_ptr_, size_);
    CHECK(cpu_ptr_ != 0) << "size " << size_;
    memset(cpu_ptr_, 0, size_);
    head_ = HEAD_AT_CPU;
    break;
#if 0
  case HEAD_AT_GPU:
    if (cpu_ptr_ == NULL) {
      CaffeMallocHost(&cpu_ptr_, size_);
    }
    CUDA_CHECK(cudaMemcpy(cpu_ptr_, gpu_ptr_, size_, cudaMemcpyDeviceToHost));
    head_ = SYNCED;
    break;
#endif
  case HEAD_AT_CPU:
  case SYNCED:
    break;
  }
}

Developer ID: Devy001, Project: Caffe-mini, Lines of code: 22
Example 11: pinnedAlloc

T* pinnedAlloc(const size_t &elements)
{
    managerInit();
    T* ptr = NULL;
    // Allocate the higher megabyte. Overhead of creating pinned memory is
    // more, so we want more reusable memory.
    size_t alloc_bytes = divup(sizeof(T) * elements, 1048576) * 1048576;

    if (elements > 0) {
        // FIXME: Add better checks for garbage collection
        // Perhaps look at total memory available as a metric
        if (pinned_maps.size() >= MAX_BUFFERS || pinned_used_bytes >= MAX_BYTES) {
            pinnedGarbageCollect();
        }

        for (mem_iter iter = pinned_maps.begin(); iter != pinned_maps.end(); ++iter) {
            mem_info info = iter->second;
            if (info.is_free && info.bytes == alloc_bytes) {
                iter->second.is_free = false;
                pinned_used_bytes += alloc_bytes;
                return (T *)iter->first;
            }
        }

        // Perform garbage collection if memory can not be allocated
        if (cudaMallocHost((void **)&ptr, alloc_bytes) != cudaSuccess) {
            pinnedGarbageCollect();
            CUDA_CHECK(cudaMallocHost((void **)(&ptr), alloc_bytes));
        }

        mem_info info = {false, false, alloc_bytes};
        pinned_maps[ptr] = info;
        pinned_used_bytes += alloc_bytes;
    }
    return (T*)ptr;
}

Developer ID: hxiaox, Project: arrayfire, Lines of code: 39
Example 12: Timer::MicroSeconds

float Timer::MicroSeconds() {
  if (!has_run_at_least_once()) {
    LOG(WARNING) << "Timer has never been run before reading time.";
    return 0;
  }
  if (running()) {
    Stop();
  }
#ifdef USE_CUDA
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_,
                                    stop_gpu_));
    // Cuda only measures milliseconds
    elapsed_microseconds_ = elapsed_milliseconds_ * 1000;
  } else {
#endif
    elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds();
#ifdef USE_CUDA
  }
#endif
  return elapsed_microseconds_;
}

Developer ID: rickyHong, Project: CaffeForOpenCL, Lines of code: 22
Example 13: calculateNormGPULaunch

/*
// Launch GPU kernel of calculate norm
//
// API
// int calculateNormGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
//         CvLSVMFeatureMapGPU *dev_norm, CUstream stream)
// INPUT
// dev_map_in
// stream
// OUTPUT
// dev_norm
// RESULT
// Error status
*/
int calculateNormGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
        CvLSVMFeatureMapGPU *dev_norm, CUstream stream)
{
    int sizeX, sizeY, xp;
    int thread_num_x, thread_num_y, thread_num_z;
    int block_num_x, block_num_y, block_num_z;
    int sharedMemBytes;
    CUresult res;

    sizeX = dev_map_in->sizeX;
    sizeY = dev_map_in->sizeY;
    xp = dev_map_in->numFeatures;

    void *calc_norm_kernel_arg[] = {
        (void *) &dev_map_in->map,
        (void *) &dev_norm->map,
        (void *) &sizeX,
        (void *) &sizeY,
        (void *) &xp,
    };

    thread_num_x = (sizeX < std::sqrt(max_threads_num)) ? sizeX : std::sqrt(max_threads_num);
    thread_num_y = (sizeY < std::sqrt(max_threads_num)) ? sizeY : std::sqrt(max_threads_num);
    thread_num_z = 1;
    block_num_x = sizeX / thread_num_x;
    block_num_y = sizeY / thread_num_y;
    block_num_z = 1;
    if (sizeX % thread_num_x != 0) block_num_x++;
    if (sizeY % thread_num_y != 0) block_num_y++;

    sharedMemBytes = 0;

    res = cuLaunchKernel(calculateNorm_func[0], block_num_x, block_num_y,
            block_num_z, thread_num_x, thread_num_y, thread_num_z,
            sharedMemBytes, stream, calc_norm_kernel_arg, NULL);
    CUDA_CHECK(res, "cuLaunchKernel(calcuateNorm)");

    return LATENT_SVM_OK;
}

Developer ID: ZenzouFuruta, Project: Autoware, Lines of code: 53
Example 14: MPIComm::ThreadFunc

void MPIComm::ThreadFunc(int device) {
#ifndef CPU_ONLY
  //LOG(ERROR) << "device_id is " << device;
  CUDA_CHECK(cudaSetDevice(device));
#endif
  started_.store(true);
  MPIJob job;
  while (true) {
    mutex::scoped_lock lock(queue_mutex_);
    while (task_queue_.empty() && IsRunning()) {
      DLOG(INFO) << "no job running, waiting on cond";
      cond_work_.wait(lock);
    }
    lock.unlock();
    DLOG(INFO) << "Cond fulfilled, dispatching job";
    if (IsRunning()) {
      job = task_queue_.front();
      DLOG(INFO) << task_queue_.size();
      DispatchJob(job);
      mutex::scoped_lock pop_lock(queue_mutex_);
      task_queue_.pop();
      pop_lock.unlock();
      cond_finish_.notify_one();
      DLOG(INFO) << "job finished, poped taskqueue";
    } else {
      break;
    }
  }

  // finish remaining jobs
  while (!task_queue_.empty()) {
    boost::lock_guard<mutex> lock(queue_mutex_);
    job = task_queue_.front();
    task_queue_.pop();
    DispatchJob(job);
  }
}

Developer ID: xiangqiaolxq, Project: caffe-parallel, Lines of code: 39
Example 15: morph

Array<T> morph(const Array<T> &in, const Array<T> &mask)
{
    const dim4 mdims = mask.dims();

    if (mdims[0] != mdims[1])
        AF_ERROR("Only square masks are supported in cuda morph currently", AF_ERR_SIZE);
    if (mdims[0] > 19)
        AF_ERROR("Upto 19x19 square kernels are only supported in cuda currently", AF_ERR_SIZE);

    Array<T> out = createEmptyArray<T>(in.dims());

    CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::cFilter, mask.get(),
                mdims[0] * mdims[1] * sizeof(T), 0, cudaMemcpyDeviceToDevice,
                cuda::getStream(cuda::getActiveDeviceId())));

    if (isDilation)
        kernel::morph<T, true >(out, in, mdims[0]);
    else
        kernel::morph<T, false>(out, in, mdims[0]);

    return out;
}

Developer ID: hxiaox, Project: arrayfire, Lines of code: 23
Example 16: memAlloc

T* memAlloc(const size_t &elements)
{
    int n = getActiveDeviceId();
    T* ptr = NULL;
    size_t alloc_bytes = divup(sizeof(T) * elements, 1024) * 1024;

    if (elements > 0) {
        // FIXME: Add better checks for garbage collection
        // Perhaps look at total memory available as a metric
        if (memory_maps[n].size() >= MAX_BUFFERS || used_bytes >= MAX_BYTES) {
            garbageCollect();
        }

        for (mem_iter iter = memory_maps[n].begin();
             iter != memory_maps[n].end(); iter++) {
            mem_info info = iter->second;
            if (info.is_free && info.bytes == alloc_bytes) {
                iter->second.is_free = false;
                used_bytes += alloc_bytes;
                return (T *)iter->first;
            }
        }

        // Perform garbage collection if memory can not be allocated
        if (cudaMalloc((void **)&ptr, alloc_bytes) != cudaSuccess) {
            garbageCollect();
            CUDA_CHECK(cudaMalloc((void **)(&ptr), alloc_bytes));
        }

        mem_info info = {false, alloc_bytes};
        memory_maps[n][ptr] = info;
        used_bytes += alloc_bytes;
    }
    return ptr;
}

Developer ID: maolingao, Project: arrayfire, Lines of code: 37
Example 17: SocketBuffer::Read

SocketBuffer* SocketBuffer::Read(bool data) {
  // Pop the message from local queue
  QueuedMessage* qm = NULL;
  if (data) {
    qm = reinterpret_cast<QueuedMessage*>(this->channel_->receive_queue.pop());
#ifndef CPU_ONLY
    // Copy the received buffer to GPU memory
    CUDA_CHECK(cudaMemcpy(this->addr(), qm->buffer,  // NOLINT(caffe/alt_fn)
                          qm->size, cudaMemcpyHostToDevice));  // NOLINT(caffe/alt_fn)
#else
    //caffe_copy(qm->size, qm->buffer, this->addr_);
    memcpy(this->addr_, qm->buffer, qm->size);
#endif
  } else {
    qm = reinterpret_cast<QueuedMessage*>(this->channel_->receive_queue_ctrl.pop());
  }
  // Free up the buffer and the wrapper object
  if (data)
    delete qm->buffer;
  delete qm;
  return this;
}

Developer ID: Aravindreddy986, Project: CaffeOnSpark, Lines of code: 24
Example 18: P2PSync::~P2PSync

P2PSync<Dtype>::~P2PSync() {
#ifndef CPU_ONLY
  int initial_device;
  CUDA_CHECK(cudaGetDevice(&initial_device));
  const int self = solver_->param().device_id();
  CUDA_CHECK(cudaSetDevice(self));

  if (parent_) {
    CUDA_CHECK(cudaFree(parent_grads_));
    const int peer = parent_->solver_->param().device_id();
    int access;
    CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer));
    if (access) {
      CUDA_CHECK(cudaDeviceDisablePeerAccess(peer));
    }
  }

  CUDA_CHECK(cudaSetDevice(initial_device));
#endif
}

Developer ID: flair2005, Project: Caffe-Solution, Lines of code: 20
Example 19: DevicePair::compute

void DevicePair::compute(const vector<int> devices, vector<DevicePair>* pairs) {
#ifndef CPU_ONLY
  vector<int> remaining(devices);
  // Depth for reduction tree
  int remaining_depth = static_cast<int>(ceil(log2(remaining.size())));
  // Group GPUs by board
  for (int d = 0; d < remaining_depth; ++d) {
    for (int i = 0; i < remaining.size(); ++i) {
      for (int j = i + 1; j < remaining.size(); ++j) {
        cudaDeviceProp a, b;
        CUDA_CHECK(cudaGetDeviceProperties(&a, remaining[i]));
        CUDA_CHECK(cudaGetDeviceProperties(&b, remaining[j]));
        if (a.isMultiGpuBoard && b.isMultiGpuBoard) {
          if (a.multiGpuBoardGroupID == b.multiGpuBoardGroupID) {
            pairs->push_back(DevicePair(remaining[i], remaining[j]));
            DLOG(INFO) << "GPU board: " << remaining[i] << ":" << remaining[j];
            remaining.erase(remaining.begin() + j);
            break;
          }
        }
      }
    }
  }
  ostringstream s;
  for (int i = 0; i < remaining.size(); ++i) {
    s << (i ? ", " : "") << remaining[i];
  }
  DLOG(INFO) << "GPUs paired by boards, remaining: " << s.str();

  // Group by P2P accessibility
  remaining_depth = ceil(log2(remaining.size()));
  for (int d = 0; d < remaining_depth; ++d) {
    for (int i = 0; i < remaining.size(); ++i) {
      for (int j = i + 1; j < remaining.size(); ++j) {
        int access;
        CUDA_CHECK(
            cudaDeviceCanAccessPeer(&access, remaining[i], remaining[j]));
        if (access) {
          pairs->push_back(DevicePair(remaining[i], remaining[j]));
          DLOG(INFO) << "P2P pair: " << remaining[i] << ":" << remaining[j];
          remaining.erase(remaining.begin() + j);
          break;
        }
      }
    }
  }
  s.str("");
  for (int i = 0; i < remaining.size(); ++i) {
    s << (i ? ", " : "") << remaining[i];
  }
  DLOG(INFO) << "GPUs paired by P2P access, remaining: " << s.str();

  // Group remaining
  remaining_depth = ceil(log2(remaining.size()));
  for (int d = 0; d < remaining_depth; ++d) {
    for (int i = 0; i < remaining.size(); ++i) {
      pairs->push_back(DevicePair(remaining[i], remaining[i + 1]));
      DLOG(INFO) << "Remaining pair: " << remaining[i] << ":"
                 << remaining[i + 1];
      remaining.erase(remaining.begin() + i + 1);
    }
  }

  // Should only be the parent node remaining
  CHECK_EQ(remaining.size(), 1);
  pairs->insert(pairs->begin(), DevicePair(-1, remaining[0]));

  CHECK(pairs->size() == devices.size());
  for (int i = 0; i < pairs->size(); ++i) {
    CHECK((*pairs)[i].parent() != (*pairs)[i].device());
    for (int j = i + 1; j < pairs->size(); ++j) {
      CHECK((*pairs)[i].device() != (*pairs)[j].device());
    }
  }
#else
  NO_GPU;
#endif
}

Developer ID: flair2005, Project: Caffe-Solution, Lines of code: 81
Example 20: orb

void orb(unsigned* out_feat, float** d_x, float** d_y, float** d_score,
         float** d_ori, float** d_size, unsigned** d_desc,
         std::vector<unsigned>& feat_pyr, std::vector<float*>& d_x_pyr,
         std::vector<float*>& d_y_pyr, std::vector<unsigned>& lvl_best,
         std::vector<float>& lvl_scl, std::vector<CParam<T> >& img_pyr,
         const float fast_thr, const unsigned max_feat, const float scl_fctr,
         const unsigned levels)
{
    unsigned patch_size = REF_PAT_SIZE;
    unsigned max_levels = feat_pyr.size();

    // In future implementations, the user will be capable of passing his
    // distribution instead of using the reference one
    //CUDA_CHECK(cudaMemcpyToSymbol(d_ref_pat, h_ref_pat, 256 * 4 * sizeof(int), 0, cudaMemcpyHostToDevice));

    std::vector<float*> d_score_pyr(max_levels);
    std::vector<float*> d_ori_pyr(max_levels);
    std::vector<float*> d_size_pyr(max_levels);
    std::vector<unsigned*> d_desc_pyr(max_levels);
    std::vector<unsigned*> d_idx_pyr(max_levels);

    unsigned total_feat = 0;

    // Calculate a separable Gaussian kernel
    unsigned gauss_len = 9;
    convAccT* h_gauss = new convAccT[gauss_len];
    gaussian1D(h_gauss, gauss_len, 2.f);
    Param<convAccT> gauss_filter;
    gauss_filter.dims[0] = gauss_len;
    gauss_filter.strides[0] = 1;

    for (int k = 1; k < 4; k++) {
        gauss_filter.dims[k] = 1;
        gauss_filter.strides[k] = gauss_filter.dims[k - 1] * gauss_filter.strides[k - 1];
    }

    dim_type gauss_elem = gauss_filter.strides[3] * gauss_filter.dims[3];
    gauss_filter.ptr = memAlloc<convAccT>(gauss_elem);
    CUDA_CHECK(cudaMemcpy(gauss_filter.ptr, h_gauss, gauss_elem * sizeof(convAccT),
               cudaMemcpyHostToDevice));

    delete[] h_gauss;

    for (int i = 0; i < (int)max_levels; i++) {
        if (feat_pyr[i] == 0 || lvl_best[i] == 0) {
            if (i > 0)
                memFree((T*)img_pyr[i].ptr);
            continue;
        }

        unsigned* d_usable_feat = memAlloc<unsigned>(1);
        CUDA_CHECK(cudaMemset(d_usable_feat, 0, sizeof(unsigned)));

        float* d_x_harris = memAlloc<float>(feat_pyr[i]);
        float* d_y_harris = memAlloc<float>(feat_pyr[i]);
        float* d_score_harris = memAlloc<float>(feat_pyr[i]);

        // Calculate Harris responses
        // Good block_size >= 7 (must be an odd number)
        dim3 threads(THREADS_X, THREADS_Y);
        dim3 blocks(divup(feat_pyr[i], threads.x), 1);
        harris_response<T,false><<<blocks, threads>>>(
            d_x_harris, d_y_harris, d_score_harris, NULL,
            d_x_pyr[i], d_y_pyr[i], NULL, feat_pyr[i], d_usable_feat,
            img_pyr[i], 7, 0.04f, patch_size);
        POST_LAUNCH_CHECK();

        unsigned usable_feat = 0;
        CUDA_CHECK(cudaMemcpy(&usable_feat, d_usable_feat, sizeof(unsigned),
                   cudaMemcpyDeviceToHost));

        memFree(d_x_pyr[i]);
        memFree(d_y_pyr[i]);
        memFree(d_usable_feat);

        feat_pyr[i] = usable_feat;

        if (feat_pyr[i] == 0) {
            memFree(d_x_harris);
            memFree(d_y_harris);
            memFree(d_score_harris);
            if (i > 0)
                memFree((T*)img_pyr[i].ptr);
            continue;
        }

        Param<float> harris_sorted;
        Param<unsigned> harris_idx;

        harris_sorted.dims[0] = harris_idx.dims[0] = feat_pyr[i];
        harris_sorted.strides[0] = harris_idx.strides[0] = 1;
        // ... (the rest of the code is omitted here)

Developer ID: pavanky, Project: arrayfire, Lines of code: 101
Example 21: getFeatureMapsGPUStream

/*
// Getting feature map for the selected subimage in GPU
//
// API
// int getFeatureMapsGPUStream(const int numStep, const int k,
//         CvLSVMFeatureMapGPU **devs_img, CvLSVMFeatureMapGPU **devs_map,
//         CUstream *streams)
// INPUT
// numStep
// k
// devs_img
// streams
// OUTPUT
// devs_map
// RESULT
// Error status
*/
int getFeatureMapsGPUStream(const int numStep, const int k,
        CvLSVMFeatureMapGPU **devs_img, CvLSVMFeatureMapGPU **devs_map,
        CUstream *streams)
{
    int sizeX, sizeY;
    int p, px;
    int height, width;
    int i, j;

    int *nearest;
    float *w, a_x, b_x;
    int size_r, size_alfa, size_nearest, size_w, size_map;

    CUresult res;
    CvLSVMFeatureMapGPU **devs_r, **devs_alfa;
    CUdeviceptr dev_nearest, dev_w;

    px = 3 * NUM_SECTOR;
    p = px;

    size_nearest = k;
    size_w = k * 2;

    devs_r = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * numStep);
    devs_alfa = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * numStep);

    nearest = (int *) malloc(sizeof(int) * size_nearest);
    w = (float *) malloc(sizeof(float) * size_w);

    // initialize "nearest" and "w"
    for (i = 0; i < k / 2; i++) {
        nearest[i] = -1;
    }/*for(i = 0; i < k / 2; i++)*/
    for (i = k / 2; i < k; i++) {
        nearest[i] = 1;
    }/*for(i = k / 2; i < k; i++)*/

    for (j = 0; j < k / 2; j++) {
        b_x = k / 2 + j + 0.5f;
        a_x = k / 2 - j - 0.5f;
        w[j * 2] = 1.0f / a_x * ((a_x * b_x) / (a_x + b_x));
        w[j * 2 + 1] = 1.0f / b_x * ((a_x * b_x) / (a_x + b_x));
    }/*for(j = 0; j < k / 2; j++)*/
    for (j = k / 2; j < k; j++) {
        a_x = j - k / 2 + 0.5f;
        b_x = -j + k / 2 - 0.5f + k;
        w[j * 2] = 1.0f / a_x * ((a_x * b_x) / (a_x + b_x));
        w[j * 2 + 1] = 1.0f / b_x * ((a_x * b_x) / (a_x + b_x));
    }/*for(j = k / 2; j < k; j++)*/

    res = cuMemAlloc(&dev_nearest, sizeof(int) * size_nearest);
    CUDA_CHECK(res, "cuMemAlloc(dev_nearest)");
    res = cuMemAlloc(&dev_w, sizeof(float) * size_w);
    CUDA_CHECK(res, "cuMemAlloc(dev_w)");

    res = cuMemcpyHtoDAsync(dev_nearest, nearest, sizeof(int) * size_nearest,
            streams[numStep - 1]);
    res = cuMemcpyHtoDAsync(dev_w, w, sizeof(float) * size_w,
            streams[numStep - 1]);

    // allocate device memory
    for (i = 0; i < numStep; i++) {
        width = devs_img[i]->sizeX;
        height = devs_img[i]->sizeY;
        allocFeatureMapObjectGPU<float>(&devs_r[i], width, height, 1);
        allocFeatureMapObjectGPU<int>(&devs_alfa[i], width, height, 2);
    }

    // execute async
    for (i = 0; i < numStep; i++) {
        // initialize "map", "r" and "alfa"
        width = devs_img[i]->sizeX;
        height = devs_img[i]->sizeY;
        sizeX = width / k;
        // ... (the rest of the code is omitted here)

Developer ID: ZenzouFuruta, Project: Autoware, Lines of code: 101
Example 22: PCAFeatureMapsGPUStream

/*
// Feature map reduction in GPU
// In each cell we reduce dimension of the feature vector
// according to original paper special procedure
//
// API
// int PCAFeatureMapsGPUStream(const int numStep, const int bx, const int by,
//         CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMap **feature_maps,
//         CUstream *streams)
// INPUT
// numStep
// bx
// by
// devs_map_in
// streams
// OUTPUT
// feature_maps
// RESULT
// Error status
*/
int PCAFeatureMapsGPUStream(const int numStep, const int bx, const int by,
        CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMap **feature_maps,
        CUstream *streams)
{
    int sizeX, sizeY, pp;
    int size_map_pca;
    int i;
    CUresult res;
    CvLSVMFeatureMapGPU **devs_map_pca;

    pp = NUM_SECTOR * 3 + 4;

    devs_map_pca = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));

    // allocate memory
    for (i = 0; i < numStep; i++) {
        sizeX = devs_map_in[i]->sizeX + 2 * bx;
        sizeY = devs_map_in[i]->sizeY + 2 * by;
        size_map_pca = sizeX * sizeY * pp;
        allocFeatureMapObject(&feature_maps[i], sizeX, sizeY, pp);
        allocFeatureMapObjectGPU<float>(&devs_map_pca[i], sizeX, sizeY, pp);
    }

    // execute async
    for (i = 0; i < numStep; i++) {
        sizeX = devs_map_pca[i]->sizeX;
        sizeY = devs_map_pca[i]->sizeY;
        size_map_pca = sizeX * sizeY * pp;

        // initialize device memory to 0
        res = cuMemsetD32Async(devs_map_pca[i]->map, 0, size_map_pca, streams[i]);
        CUDA_CHECK(res, "cuMemset(dev_map_pca)");

        // launch kernel
        PCAFeatureMapsAddNullableBorderGPULaunch(devs_map_in[i], devs_map_pca[i],
                bx, by, streams[i]);
    }

    for (i = 0; i < numStep; i++) {
        sizeX = devs_map_pca[i]->sizeX;
        sizeY = devs_map_pca[i]->sizeY;
        size_map_pca = sizeX * sizeY * pp;

        // copy memory from device to host
        res = cuMemcpyDtoHAsync(feature_maps[i]->map, devs_map_pca[i]->map,
                sizeof(float) * size_map_pca, streams[i]);
        CUDA_CHECK(res, "cuMemcpyDtoH(dev_map_pca)");
    }

    // free device memory
    for (i = 0; i < numStep; i++) {
        freeFeatureMapObjectGPU(&devs_map_pca[i]);
    }

    free(devs_map_pca);

    return LATENT_SVM_OK;
}

Developer ID: ZenzouFuruta, Project: Autoware, Lines of code: 87
Example 23: normalizeAndTruncateGPUStream

/*
// Feature map Normalization and Truncation in GPU
//
// API
// int normalizeAndTruncateGPUStream(const int numStep, const float alfa,
//         CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMapGPU **devs_map_out,
//         CUstream *streams)
// INPUT
// numStep
// alfa
// devs_map_in
// streams
// OUTPUT
// devs_map_out
// RESULT
// Error status
*/
int normalizeAndTruncateGPUStream(const int numStep, const float alfa,
        CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMapGPU **devs_map_out,
        CUstream *streams)
{
    int sizeX, sizeY, newSizeX, newSizeY, pp;
    int size_norm, size_map_out;
    int i;
    CUresult res;
    CvLSVMFeatureMapGPU **devs_norm;

    pp = NUM_SECTOR * 12;

    devs_norm = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));

    // allocate device memory
    for (i = 0; i < numStep; i++) {
        sizeX = devs_map_in[i]->sizeX;
        sizeY = devs_map_in[i]->sizeY;
        newSizeX = sizeX - 2;
        newSizeY = sizeY - 2;
        allocFeatureMapObjectGPU<float>(&devs_norm[i], sizeX, sizeY, 1);
    }

    // execute async
    for (i = 0; i < numStep; i++) {
        sizeX = devs_map_in[i]->sizeX;
        sizeY = devs_map_in[i]->sizeY;
        newSizeX = sizeX - 2;
        newSizeY = sizeY - 2;
        size_norm = sizeX * sizeY;
        size_map_out = newSizeX * newSizeY * pp;

        // initialize device memory to 0
        res = cuMemsetD32Async(devs_norm[i]->map, 0, size_norm, streams[i]);
        CUDA_CHECK(res, "cuMemset(dev_norm)");
        res = cuMemsetD32Async(devs_map_out[i]->map, 0, size_map_out, streams[i]);
        CUDA_CHECK(res, "cuMemset(dev_map_out)");

        // launch kernel
        calculateNormGPULaunch(devs_map_in[i], devs_norm[i], streams[i]);
    }

    for (i = 0; i < numStep; i++) {
        // launch kernel
        normalizeGPULaunch(alfa, devs_map_in[i], devs_norm[i], devs_map_out[i],
                streams[i]);
    }

    // synchronize cuda stream
    for (i = 0; i < numStep; i++) {
        cuStreamSynchronize(streams[i]);
    }

    // free device memory
    for (i = 0; i < numStep; i++) {
        freeFeatureMapObjectGPU(&devs_norm[i]);
    }

    free(devs_norm);

    return LATENT_SVM_OK;
}

Developer ID: ZenzouFuruta, Project: Autoware, Lines of code: 89
Example 24: MultiStageMeanfieldLayer::LayerSetUp

void MultiStageMeanfieldLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {

  init_cpu = false;
  init_gpu = false;
  const caffe::MultiStageMeanfieldParameter meanfield_param =
      this->layer_param_.multi_stage_meanfield_param();

  num_iterations_ = meanfield_param.num_iterations();
  CHECK_GT(num_iterations_, 1) << "Number of iterations must be greater than 1.";

  theta_alpha_ = meanfield_param.theta_alpha();
  theta_beta_ = meanfield_param.theta_beta();
  theta_gamma_ = meanfield_param.theta_gamma();

  count_ = bottom[0]->count();
  num_ = bottom[0]->num();
  channels_ = bottom[0]->channels();
  height_ = bottom[0]->height();
  width_ = bottom[0]->width();
  num_pixels_ = height_ * width_;

  LOG(INFO) << "This implementation has not been tested batch size > 1.";

  top[0]->Reshape(num_, channels_, height_, width_);

  // Initialize the parameters that will updated by backpropagation.
  if (this->blobs_.size() > 0) {
    LOG(INFO) << "Multimeanfield layer skipping parameter initialization.";
  } else {
    // blobs_[0] - spatial kernel weights, blobs_[1] - bilateral kernel weights,
    // blobs_[2] - compatibility matrix
    this->blobs_.resize(3);

    // Allocate space for kernel weights.
    this->blobs_[0].reset(new Blob<Dtype>(1, 1, channels_, channels_));
    this->blobs_[1].reset(new Blob<Dtype>(1, 1, channels_, channels_));

    caffe_set(channels_ * channels_, Dtype(0.), this->blobs_[0]->mutable_cpu_data());
    caffe_set(channels_ * channels_, Dtype(0.), this->blobs_[1]->mutable_cpu_data());

    // Initialize the kernels weights. The two files spatial.par and
    // bilateral.par should be available.
    FILE * pFile;
    pFile = fopen("spatial.par", "r");
    CHECK(pFile) << "The file 'spatial.par' is not found. Please create it with initial spatial kernel weights.";
    for (int i = 0; i < channels_; i++) {
      fscanf(pFile, "%lf", &this->blobs_[0]->mutable_cpu_data()[i * channels_ + i]);
    }
    fclose(pFile);

    pFile = fopen("bilateral.par", "r");
    CHECK(pFile) << "The file 'bilateral.par' is not found. Please create it with initial bilateral kernel weights.";
    for (int i = 0; i < channels_; i++) {
      fscanf(pFile, "%lf", &this->blobs_[1]->mutable_cpu_data()[i * channels_ + i]);
    }
    fclose(pFile);

    // Initialize the compatibility matrix.
    this->blobs_[2].reset(new Blob<Dtype>(1, 1, channels_, channels_));
    caffe_set(channels_ * channels_, Dtype(0.), this->blobs_[2]->mutable_cpu_data());

    // Initialize it to have the Potts model.
    for (int c = 0; c < channels_; ++c) {
      (this->blobs_[2]->mutable_cpu_data())[c * channels_ + c] = Dtype(-1.);
    }
  }

  float spatial_kernel[2 * num_pixels_];
  float *spatial_kernel_gpu_;
  compute_spatial_kernel(spatial_kernel);
  spatial_lattice_.reset(new ModifiedPermutohedral());
  spatial_norm_.Reshape(1, 1, height_, width_);
  Dtype* norm_data_gpu;
  Dtype* norm_data;

  // Initialize the spatial lattice. This does not need to be computed for
  // every image because we use a fixed size.
  switch (Caffe::mode()) {
    case Caffe::CPU:
      norm_data = spatial_norm_.mutable_cpu_data();
      spatial_lattice_->init(spatial_kernel, 2, width_, height_);
      // Calculate spatial filter normalization factors.
      norm_feed_ = new Dtype[num_pixels_];
      caffe_set(num_pixels_, Dtype(1.0), norm_feed_);
      // pass norm_feed and norm_data to gpu
      spatial_lattice_->compute(norm_data, norm_feed_, 1);
      bilateral_kernel_buffer_ = new float[5 * num_pixels_];
      init_cpu = true;
      break;
#ifndef CPU_ONLY
    case Caffe::GPU:
      CUDA_CHECK(cudaMalloc((void**)&spatial_kernel_gpu_, 2 * num_pixels_ * sizeof(float)));
      CUDA_CHECK(cudaMemcpy(spatial_kernel_gpu_, spatial_kernel,
                 2 * num_pixels_ * sizeof(float), cudaMemcpyHostToDevice));
      spatial_lattice_->init(spatial_kernel_gpu_, 2, width_, height_);
      CUDA_CHECK(cudaMalloc((void**)&norm_feed_, num_pixels_ * sizeof(Dtype)));
      caffe_gpu_set(num_pixels_, Dtype(1.0), norm_feed_);
      norm_data_gpu = spatial_norm_.mutable_gpu_data();
      spatial_lattice_->compute(norm_data_gpu, norm_feed_, 1);
      norm_data = spatial_norm_.mutable_cpu_data();
      CUDA_CHECK(cudaMalloc((void**)&bilateral_kernel_buffer_, 5 * num_pixels_ * sizeof(float)));
      CUDA_CHECK(cudaFree(spatial_kernel_gpu_));
      init_gpu = true;
      break;
      // ... (the rest of the code is omitted here)

Developer ID: hyenal, Project: crfasrnn, Lines of code: 101
Example 25: getPathOfFeaturePyramidGPUStream

/*
// Property Message
//
// API
// static int getPathOfFeaturePyramidGPUStream(IplImage * image, float step,
//         int numStep, int startIndex, int sideLength, int bx, int by,
//         CvLSVMFeaturePyramid **maps)
// INPUT
// image
// step
// numStep
// startIndex
// sideLength
// bx
// by
// OUTPUT
// maps
// RESULT
// Error status
*/
static int getPathOfFeaturePyramidGPUStream(IplImage * image, float step,
        int numStep, int startIndex, int sideLength, int bx, int by,
        CvLSVMFeaturePyramid **maps)
{
    CvLSVMFeatureMap **feature_maps;
    int i;
    int width, height, numChannels, sizeX, sizeY, p, pp, newSizeX, newSizeY;
    float *scales;
    CvLSVMFeatureMapGPU **devs_img, **devs_map_pre_norm, **devs_map_pre_pca;
    CUstream *streams;
    CUresult res;

    scales = (float *) malloc(sizeof(float) * (numStep));
    devs_img = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));
    devs_map_pre_norm = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));
    devs_map_pre_pca = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));
    streams = (CUstream *) malloc(sizeof(CUstream) * (numStep));
    feature_maps = (CvLSVMFeatureMap **) malloc(
            sizeof(CvLSVMFeatureMap *) * (numStep));

    // allocate device memory
    for (i = 0; i < numStep; i++) {
        scales[i] = 1.0f / powf(step, (float) i);
        width  = (int) (((float) image->width ) * scales[i] + 0.5);
        height = (int) (((float) image->height) * scales[i] + 0.5);
        numChannels = image->nChannels;
        sizeX = width / sideLength;
        sizeY = height / sideLength;
        p = NUM_SECTOR * 3;
        pp = NUM_SECTOR * 12;
        newSizeX = sizeX - 2;
        newSizeY = sizeY - 2;
        allocFeatureMapObjectGPU<float>(&devs_img[i], width, height, numChannels);
        allocFeatureMapObjectGPU<float>(&devs_map_pre_norm[i], sizeX, sizeY, p);
        allocFeatureMapObjectGPU<float>(&devs_map_pre_pca[i], newSizeX, newSizeY, pp);
        res = cuStreamCreate(&streams[i], CU_STREAM_DEFAULT);
        CUDA_CHECK(res, "cuStreamCreate(stream)");
    }

    // execute main function
    resizeGPUStream(numStep, image, scales, devs_img, streams);
    getFeatureMapsGPUStream(numStep, sideLength, devs_img, devs_map_pre_norm,
            streams);
    normalizeAndTruncateGPUStream(numStep, Val_Of_Truncate, devs_map_pre_norm,
            devs_map_pre_pca, streams);
    PCAFeatureMapsGPUStream(numStep, bx, by, devs_map_pre_pca, feature_maps,
            streams);

    // synchronize cuda stream
    for (i = 0; i < numStep; i++) {
        cuStreamSynchronize(streams[i]);
        cuStreamDestroy(streams[i]);
    }

    for (i = 0; i < numStep; i++) {
        (*maps)->pyramid[startIndex + i] = feature_maps[i];
    }/*for(i = 0; i < numStep; i++)*/

    // free device memory
    for (i = 0; i < numStep; i++) {
        freeFeatureMapObjectGPU(&devs_img[i]);
        freeFeatureMapObjectGPU(&devs_map_pre_norm[i]);
        freeFeatureMapObjectGPU(&devs_map_pre_pca[i]);
    }

    free(scales);
    // ... (the rest of the code is omitted here)

Developer ID: ZenzouFuruta, Project: Autoware, Lines of code: 101
Example 26: pocl_cuda_alloc_mem_obj

cl_int
pocl_cuda_alloc_mem_obj (cl_device_id device, cl_mem mem_obj, void *host_ptr)
{
  cuCtxSetCurrent (((pocl_cuda_device_data_t *)device->data)->context);

  CUresult result;
  void *b = NULL;

  /* if memory for this global memory is not yet allocated -> do it */
  if (mem_obj->device_ptrs[device->global_mem_id].mem_ptr == NULL)
    {
      cl_mem_flags flags = mem_obj->flags;

      if (flags & CL_MEM_USE_HOST_PTR)
        {
#if defined __arm__
          // cuMemHostRegister is not supported on ARM
          // Allocate device memory and perform explicit copies
          // before and after running a kernel
          result = cuMemAlloc ((CUdeviceptr *)&b, mem_obj->size);
          CUDA_CHECK (result, "cuMemAlloc");
#else
          result = cuMemHostRegister (host_ptr, mem_obj->size,
                                      CU_MEMHOSTREGISTER_DEVICEMAP);
          if (result != CUDA_SUCCESS
              && result != CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED)
            CUDA_CHECK (result, "cuMemHostRegister");
          result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b, host_ptr, 0);
          CUDA_CHECK (result, "cuMemHostGetDevicePointer");
#endif
        }
      else if (flags & CL_MEM_ALLOC_HOST_PTR)
        {
          result = cuMemHostAlloc (&mem_obj->mem_host_ptr, mem_obj->size,
                                   CU_MEMHOSTREGISTER_DEVICEMAP);
          CUDA_CHECK (result, "cuMemHostAlloc");
          result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b,
                                              mem_obj->mem_host_ptr, 0);
          CUDA_CHECK (result, "cuMemHostGetDevicePointer");
        }
      else
        {
          result = cuMemAlloc ((CUdeviceptr *)&b, mem_obj->size);
          if (result != CUDA_SUCCESS)
            {
              const char *err;
              cuGetErrorName (result, &err);
              POCL_MSG_PRINT2 (__FUNCTION__, __LINE__,
                               "-> Failed to allocate memory: %s\n", err);
              return CL_MEM_OBJECT_ALLOCATION_FAILURE;
            }
        }

      if (flags & CL_MEM_COPY_HOST_PTR)
        {
          result = cuMemcpyHtoD ((CUdeviceptr)b, host_ptr, mem_obj->size);
          CUDA_CHECK (result, "cuMemcpyHtoD");
        }

      mem_obj->device_ptrs[device->global_mem_id].mem_ptr = b;
      mem_obj->device_ptrs[device->global_mem_id].global_mem_id
          = device->global_mem_id;
    }

  /* copy already allocated global mem info to devices own slot */
  mem_obj->device_ptrs[device->dev_id]
      = mem_obj->device_ptrs[device->global_mem_id];

  return CL_SUCCESS;
}

Developer ID: jrprice, Project: pocl, Lines of code: 70
Example 27: TestSAXPY

CUresult
TestSAXPY( chCUDADevice *chDevice, size_t N, float alpha )
{
    CUresult status;
    CUdeviceptr dptrOut = 0;
    CUdeviceptr dptrIn = 0;
    float *hostOut = 0;
    float *hostIn = 0;

    CUDA_CHECK( cuCtxPushCurrent( chDevice->context() ) );

    CUDA_CHECK( cuMemAlloc( &dptrOut, N*sizeof(float) ) );
    CUDA_CHECK( cuMemsetD32( dptrOut, 0, N ) );
    CUDA_CHECK( cuMemAlloc( &dptrIn, N*sizeof(float) ) );
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostOut, N*sizeof(float), 0 ) );
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostIn, N*sizeof(float), 0 ) );

    for ( size_t i = 0; i < N; i++ ) {
        hostIn[i] = (float) rand() / (float) RAND_MAX;
    }
    CUDA_CHECK( cuMemcpyHtoDAsync( dptrIn, hostIn, N*sizeof(float), NULL ) );

    {
        CUmodule moduleSAXPY;
        CUfunction kernelSAXPY;
        void *params[] = { &dptrOut, &dptrIn, &N, &alpha };

        moduleSAXPY = chDevice->module( "saxpy.ptx" );
        if ( ! moduleSAXPY ) {
            status = CUDA_ERROR_NOT_FOUND;
            goto Error;
        }
        CUDA_CHECK( cuModuleGetFunction( &kernelSAXPY, moduleSAXPY, "saxpy" ) );
        CUDA_CHECK( cuLaunchKernel( kernelSAXPY, 1500, 1, 1, 512, 1, 1, 0, NULL,
                                    params, NULL ) );
    }

    CUDA_CHECK( cuMemcpyDtoHAsync( hostOut, dptrOut, N*sizeof(float), NULL ) );
    CUDA_CHECK( cuCtxSynchronize() );

    for ( size_t i = 0; i < N; i++ ) {
        if ( fabsf( hostOut[i] - alpha*hostIn[i] ) > 1e-5f ) {
            status = CUDA_ERROR_UNKNOWN;
            goto Error;
        }
    }
    status = CUDA_SUCCESS;
    printf( "Well it worked!\n" );

Error:
    cuCtxPopCurrent( NULL );
    cuMemFreeHost( hostOut );
    cuMemFreeHost( hostIn );
    cuMemFree( dptrOut );
    cuMemFree( dptrIn );
    return status;
}

Developer ID: AnilVarmaBiruduraju, Project: cudahandbook, Lines of code: 56
Example 28: Caffe::SetDevice

void Caffe::SetDevice(const int device_id) {
  root_device_ = device_id;
  CUDA_CHECK(cudaSetDevice(root_device_));
}

Developer ID: Caffe-MPI, Project: Caffe-MPI.github.io, Lines of code: 4
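To tie the examples together, here is a minimal self-contained program in the same spirit. It is a sketch only: the macro below is a hypothetical glog-free stand-in for Caffe's version, so that it compiles outside any of the projects above.

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical stand-in for the glog-based CUDA_CHECK used by Caffe:
// print file, line, and the CUDA error string, then bail out.
// The "return 1" works here because the macro is only used inside main().
#define CUDA_CHECK(condition)                                        \
    do {                                                             \
        cudaError_t error = (condition);                             \
        if (error != cudaSuccess) {                                  \
            fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__,       \
                    cudaGetErrorString(error));                      \
            return 1;                                                \
        }                                                            \
    } while (0)

int main() {
    CUDA_CHECK(cudaSetDevice(0));  // select the first GPU

    float *d_buf = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_buf, 1024 * sizeof(float)));
    CUDA_CHECK(cudaMemset(d_buf, 0, 1024 * sizeof(float)));
    CUDA_CHECK(cudaFree(d_buf));

    printf("all CUDA calls succeeded\n");
    return 0;
}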
Note: the CUDA_CHECK examples in this article were collected from GitHub, MSDocs, and similar source-code and documentation platforms, and the snippets were selected from open-source projects contributed by many developers. Copyright of the source code remains with the original authors; consult the corresponding project's license before distributing or reusing it. Do not reproduce this article without permission.