您当前的位置:首页 > IT编程 > C++
| C语言 | Java | VB | VC | python | Android | TensorFlow | C++ | oracle | 学术与代码 | cnn卷积神经网络 | gnn | 图像修复 | Keras | 数据集 | Neo4j | 自然语言处理 | 深度学习 | 医学CAD | 医学影像 | 超参数 | pointnet | pytorch | 异常检测 | Transformers | 情感分类 | 知识图谱 |

自学教程:C++ CUDA_CHECK函数代码示例

51自学网 2021-06-01 20:07:13
  C++
这篇教程C++ CUDA_CHECK函数代码示例写得很实用,希望能帮到您。

本文整理汇总了C++中CUDA_CHECK函数的典型用法代码示例。如果您正苦于以下问题:C++ CUDA_CHECK函数的具体用法?C++ CUDA_CHECK怎么用?C++ CUDA_CHECK使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。

在下文中一共展示了CUDA_CHECK函数的28个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的C++代码示例。

示例1: InternalThread::StartInternalThread

void InternalThread::StartInternalThread() {  // TODO switch to failing once Caffe prefetch thread is persistent.  // Threads should not be started and stopped repeatedly.  // CHECK(!is_started());  StopInternalThread();#ifndef CPU_ONLY  CUDA_CHECK(cudaGetDevice(&device_));#endif  mode_ = Caffe::mode();  rand_seed_ = caffe_rng_rand();  solver_count_ = Caffe::solver_count();  root_solver_ = Caffe::root_solver();  try {    thread_.reset(new boost::thread(&InternalThread::entry, this));  } catch (std::exception& e) {    CHECK(false) << e.what();  }}
开发者ID:XinLiuNvidia,项目名称:caffe,代码行数:20,


示例2: transform

        void transform(Param<T> out, CParam<T> in, CParam<float> tf,                       const bool inverse)        {            const dim_type nimages = in.dims[2];            // Multiplied in src/backend/transform.cpp            const dim_type ntransforms = out.dims[2] / in.dims[2];            // Copy transform to constant memory.            CUDA_CHECK(cudaMemcpyToSymbol(c_tmat, tf.ptr, ntransforms * 6 * sizeof(float), 0,                                          cudaMemcpyDeviceToDevice));            dim3 threads(TX, TY, 1);            dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y));            if (nimages > 1)     { blocks.x *= nimages;   }            if (ntransforms > 1) { blocks.y *= ntransforms; }            if(inverse) {                transform_kernel<T, true><<<blocks, threads>>>(out, in, nimages, ntransforms);            } else {
开发者ID:EasonYi,项目名称:arrayfire,代码行数:20,


示例3: Timer::MilliSeconds

float Timer::MilliSeconds() {  if (!has_run_at_least_once()) {    LOG(WARNING) << "Timer has never been run before reading time.";    return 0;  }  if (running()) {    Stop();  }  if (Caffe::mode() == Caffe::GPU) {#ifndef CPU_ONLY    CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_,                                    stop_gpu_));#else      NO_GPU;#endif  } else {    elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds();  }  return elapsed_milliseconds_;}
开发者ID:azrael417,项目名称:caffe,代码行数:20,


示例4: CuDNNConvolutionLayer::LayerSetUp

// Sets up the cuDNN convolution layer: runs the base-class setup, creates the
// CUDA streams and cuDNN handles, computes per-group offsets, and creates the
// filter, tensor, convolution and (optional) bias descriptors.
void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);

  // Initialize CUDA streams and cuDNN: one stream/handle pair per slot.
  const int num_slots = this->group_ * CUDNN_STREAMS_PER_GROUP;
  stream_ = new cudaStream_t[num_slots];
  handle_ = new cudnnHandle_t[num_slots];
  for (int slot = 0; slot < num_slots; ++slot) {
    CUDA_CHECK(cudaStreamCreate(&stream_[slot]));
    CUDNN_CHECK(cudnnCreate(&handle_[slot]));
    CUDNN_CHECK(cudnnSetStream(handle_[slot], stream_[slot]));
  }

  // Set the indexing parameters (element offsets between consecutive groups).
  weight_offset_ = (this->num_output_ / this->group_)
      * (this->channels_ / this->group_) * this->kernel_h_ * this->kernel_w_;
  bias_offset_ = (this->num_output_ / this->group_);

  // Create filter descriptor.
  cudnn::createFilterDesc<Dtype>(&filter_desc_,
      this->num_output_ / this->group_, this->channels_ / this->group_,
      this->kernel_h_, this->kernel_w_);

  // Create tensor descriptor(s) for data and corresponding convolution(s):
  // one bottom/top/convolution descriptor triple per bottom blob.
  for (size_t n = 0; n < bottom.size(); ++n) {
    cudnnTensor4dDescriptor_t in_desc;
    cudnn::createTensor4dDesc<Dtype>(&in_desc);
    bottom_descs_.push_back(in_desc);

    cudnnTensor4dDescriptor_t out_desc;
    cudnn::createTensor4dDesc<Dtype>(&out_desc);
    top_descs_.push_back(out_desc);

    cudnnConvolutionDescriptor_t cdesc;
    cudnn::createConvolutionDesc<Dtype>(&cdesc);
    conv_descs_.push_back(cdesc);
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::createTensor4dDesc<Dtype>(&bias_desc_);
  }
}
开发者ID:13331151,项目名称:deeplab-public,代码行数:41,


示例5: morph

// Applies a morphological operation to `in` using `mask` as the structuring
// element. The mask must be square and at most 19x19; it is copied into the
// kernel::cFilter symbol on the active stream before the kernel is dispatched.
// `isDilation` selects which kernel::morph instantiation runs.
Array<T> morph(const Array<T> &in, const Array<T> &mask) {
    const dim4 mdims = mask.dims();

    // Reject mask shapes the kernel does not handle.
    if (mdims[0] != mdims[1]) {
        CUDA_NOT_SUPPORTED("Rectangular masks are not supported");
    }
    if (mdims[0] > 19) {
        CUDA_NOT_SUPPORTED("Kernels > 19x19 are not supported");
    }

    Array<T> out = createEmptyArray<T>(in.dims());

    // Upload the mask (device-to-device copy) into the filter symbol.
    CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::cFilter,
                                       mask.get(),
                                       mdims[0] * mdims[1] * sizeof(T),
                                       0,
                                       cudaMemcpyDeviceToDevice,
                                       cuda::getActiveStream()));

    if (!isDilation) {
        kernel::morph<T, false>(out, in, mdims[0]);
    } else {
        kernel::morph<T, true>(out, in, mdims[0]);
    }

    return out;
}
开发者ID:9prady9,项目名称:arrayfire,代码行数:21,


示例6: caffe_copy

void caffe_copy(const int N, const Dtype* X, Dtype* Y) {  if (X != Y) {    // If there are more than one openmp thread (we are in active region)    // then checking Caffe::mode can create additional GPU Context    //    if (#ifdef _OPENMP        (omp_in_parallel() == 0) &&#endif        (Caffe::mode() == Caffe::GPU)) {#ifndef CPU_ONLY      // NOLINT_NEXT_LINE(caffe/alt_fn)      CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault));#else      NO_GPU;#endif    } else {      caffe_cpu_copy<Dtype>(N, X, Y);    }  }}
开发者ID:crobertob,项目名称:caffe,代码行数:21,


示例7: SyncedMemory::to_cpu

// 把数据放到cpu上inline void SyncedMemory::to_cpu() {  switch (head_) {  case UNINITIALIZED:    CaffeMallocHost(&cpu_ptr_, size_);    memset(cpu_ptr_, 0, size_);    head_ = HEAD_AT_CPU;    own_cpu_data_ = true;    break;  case HEAD_AT_GPU:    if (cpu_ptr_ == NULL) {      CaffeMallocHost(&cpu_ptr_, size_);      own_cpu_data_ = true;    }    CUDA_CHECK(cudaMemcpy(cpu_ptr_, gpu_ptr_, size_, cudaMemcpyDeviceToHost));    head_ = SYNCED;    break;  case HEAD_AT_CPU:  case SYNCED:    break;  }}
开发者ID:clarencezhang,项目名称:caffe-windows-multilabels,代码行数:22,


示例8: normalizeGPULaunch

/*// Launch GPU kernel of normalize//// API// int normalizeGPULaunch(const int alfa, CvLSVMFeatureMapGPU *dev_map_in,           CvLSVMFeatureMapGPU *dev_norm, CvLSVMFeatureMapGPU *dev_map_out,           CUstream stream);// INPUT// alfa// dev_map_in// dev_norm// stream// OUTPUT// dev_map_out// RESULT// Error status*/int normalizeGPULaunch(const float alfa, CvLSVMFeatureMapGPU *dev_map_in,        CvLSVMFeatureMapGPU *dev_norm, CvLSVMFeatureMapGPU *dev_map_out,        CUstream stream){    int sizeX, sizeY;    int thread_num_x, thread_num_y, thread_num_z;    int block_num_x, block_num_y, block_num_z;    int sharedMemBytes;    CUresult res;    sizeX = dev_map_in->sizeX;    sizeY = dev_map_in->sizeY;    void *normalize_kernel_arg[] =    { (void *) &dev_map_in->map, (void *) &dev_norm->map,            (void *) &dev_map_out->map, (void *) &sizeX, (void *) &sizeY,            (void *) &alfa, };    thread_num_x =            (sizeX < std::sqrt(max_threads_num)) ? sizeX : std::sqrt(max_threads_num);    thread_num_y =            (sizeY < std::sqrt(max_threads_num)) ? sizeY : std::sqrt(max_threads_num);    thread_num_z = 1;    block_num_x = sizeX / thread_num_x;    block_num_y = sizeY / thread_num_y;    block_num_z = NUM_SECTOR * 2;    if (sizeX % thread_num_x != 0)        block_num_x++;    if (sizeY % thread_num_y != 0)        block_num_y++;    sharedMemBytes = 0;    res = cuLaunchKernel(normalizeAndTruncate_func[0], block_num_x, block_num_y,            block_num_z, thread_num_x, thread_num_y, thread_num_z,            sharedMemBytes, stream, normalize_kernel_arg, NULL);    CUDA_CHECK(res, "cuLaunchKernel(normalizeAndTruncate)");    return LATENT_SVM_OK;}
开发者ID:ZenzouFuruta,项目名称:Autoware,代码行数:57,


示例9: PCAFeatureMapsAddNullableBorderGPULaunch

/*// Launch GPU kernel of PCA feature maps//// API// int PCAFeatureMapsAddNullableBorderGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,           CvLSVMFeatureMapGPU *dev_map_out, const int bx, const int by,           CUstream stream);// INPUT// dev_map_in// bx// by// stream// OUTPUT// dev_map_out// RESULT// Error status*/int PCAFeatureMapsAddNullableBorderGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,        CvLSVMFeatureMapGPU *dev_map_out, const int bx, const int by,        CUstream stream){    int sizeX, sizeY, p;    int thread_num_x, thread_num_y, thread_num_z;    int block_num_x, block_num_y, block_num_z;    int sharedMemBytes;    CUresult res;    sizeX = dev_map_in->sizeX;    sizeY = dev_map_in->sizeY;    p = dev_map_in->numFeatures;    void *pca_kernel_arg[] =    { (void *) &dev_map_in->map, (void *) &dev_map_out->map, (void *) &sizeX,            (void *) &sizeY, (void *) &p, (void *) &bx, (void *) &by };    thread_num_x =            (sizeX < std::sqrt(max_threads_num)) ? sizeX : std::sqrt(max_threads_num);    thread_num_y =            (sizeY < std::sqrt(max_threads_num)) ? sizeY : std::sqrt(max_threads_num);    thread_num_z = 1;    block_num_x = sizeX / thread_num_x;    block_num_y = sizeY / thread_num_y;    block_num_z = 1;    if (sizeX % thread_num_x != 0)        block_num_x++;    if (sizeY % thread_num_y != 0)        block_num_y++;    sharedMemBytes = 0;    res = cuLaunchKernel(PCAFeatureMapsAddNullableBorder_func[0], block_num_x,            block_num_y, block_num_z, thread_num_x, thread_num_y, thread_num_z,            sharedMemBytes, stream, pca_kernel_arg, NULL);    CUDA_CHECK(res, "cuLaunchKernel(PCAFeatureMaps)");    return LATENT_SVM_OK;}
开发者ID:ZenzouFuruta,项目名称:Autoware,代码行数:57,


示例10: SyncedMemory::to_cpu

// Makes the data available on the CPU. Only the UNINITIALIZED state does any
// work here (allocate + zero-fill the host buffer); the GPU-to-CPU transfer
// path is compiled out with #if 0.
inline void SyncedMemory::to_cpu() {
  if (head_ == UNINITIALIZED) {
    CaffeMallocHost(&cpu_ptr_, size_);
    CHECK(cpu_ptr_ != 0) << "size " << size_;
    memset(cpu_ptr_, 0, size_);
    head_ = HEAD_AT_CPU;
    return;
  }

#if 0
  if (head_ == HEAD_AT_GPU) {
    if (cpu_ptr_ == NULL) {
      CaffeMallocHost(&cpu_ptr_, size_);
    }
    CUDA_CHECK(cudaMemcpy(cpu_ptr_, gpu_ptr_, size_, cudaMemcpyDeviceToHost));
    head_ = SYNCED;
    return;
  }
#endif

  // HEAD_AT_CPU or SYNCED: nothing to do.
}
开发者ID:Devy001,项目名称:Caffe-mini,代码行数:22,


示例11: pinnedAlloc

    // Returns a pinned (page-locked) host buffer large enough for `elements`
    // objects of type T, or NULL when `elements` is 0. A free cached buffer of
    // exactly the rounded size is reused when one exists; otherwise a new one
    // is allocated with cudaMallocHost and recorded in pinned_maps.
    T* pinnedAlloc(const size_t &elements)
    {
        managerInit();

        T* ptr = NULL;

        // Allocate the higher megabyte. Overhead of creating pinned memory is
        // more so we want more reusable memory.
        const size_t one_mb = 1048576;
        size_t alloc_bytes = divup(sizeof(T) * elements, one_mb) * one_mb;

        if (elements == 0) {
            return (T*)ptr;
        }

        // FIXME: Add better checks for garbage collection
        // Perhaps look at total memory available as a metric
        const bool over_budget =
            pinned_maps.size() >= MAX_BUFFERS || pinned_used_bytes >= MAX_BYTES;
        if (over_budget) {
            pinnedGarbageCollect();
        }

        // Look for a cached buffer that is free and has exactly this size.
        mem_iter iter = pinned_maps.begin();
        while (iter != pinned_maps.end()) {
            if (iter->second.is_free && iter->second.bytes == alloc_bytes) {
                iter->second.is_free = false;
                pinned_used_bytes += alloc_bytes;
                return (T *)iter->first;
            }
            ++iter;
        }

        // Perform garbage collection if memory can not be allocated,
        // then retry once; the retry is checked with CUDA_CHECK.
        if (cudaMallocHost((void **)&ptr, alloc_bytes) != cudaSuccess) {
            pinnedGarbageCollect();
            CUDA_CHECK(cudaMallocHost((void **)(&ptr), alloc_bytes));
        }

        // Register the new buffer as in use.
        mem_info info = {false, false, alloc_bytes};
        pinned_maps[ptr] = info;
        pinned_used_bytes += alloc_bytes;

        return (T*)ptr;
    }
开发者ID:hxiaox,项目名称:arrayfire,代码行数:39,


示例12: Timer::MicroSeconds

float Timer::MicroSeconds() {  if (!has_run_at_least_once()) {    LOG(WARNING)<< "Timer has never been run before reading time.";    return 0;  }  if (running()) {    Stop();  }#ifdef USE_CUDA  if (Caffe::mode() == Caffe::GPU) {    CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_,            stop_gpu_));    // Cuda only measure milliseconds    elapsed_microseconds_ = elapsed_milliseconds_ * 1000;  } else {#endif    elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds();#ifdef USE_CUDA  }#endif  return elapsed_microseconds_;}
开发者ID:rickyHong,项目名称:CaffeForOpenCL,代码行数:22,


示例13: calculateNormGPULaunch

/*// Launch GPU kernel of calculate norm//// API//int calculateNormGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,          CvLSVMFeatureMapGPU *dev_norm, CUstream stream)// INPUT// dev_map_in// stream// OUTPUT// dev_norm// RESULT// Error status*/int calculateNormGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,        CvLSVMFeatureMapGPU *dev_norm, CUstream stream){    int sizeX, sizeY, xp;    int thread_num_x, thread_num_y, thread_num_z;    int block_num_x, block_num_y, block_num_z;    int sharedMemBytes;    CUresult res;    sizeX = dev_map_in->sizeX;    sizeY = dev_map_in->sizeY;    xp = dev_map_in->numFeatures;    void *calc_norm_kernel_arg[] =    { (void *) &dev_map_in->map, (void *) &dev_norm->map, (void *) &sizeX,            (void *) &sizeY, (void *) &xp, };    thread_num_x =            (sizeX < std::sqrt(max_threads_num)) ? sizeX : std::sqrt(max_threads_num);    thread_num_y =            (sizeY < std::sqrt(max_threads_num)) ? sizeY : std::sqrt(max_threads_num);    thread_num_z = 1;    block_num_x = sizeX / thread_num_x;    block_num_y = sizeY / thread_num_y;    block_num_z = 1;    if (sizeX % thread_num_x != 0)        block_num_x++;    if (sizeY % thread_num_y != 0)        block_num_y++;    sharedMemBytes = 0;    res = cuLaunchKernel(calculateNorm_func[0], block_num_x, block_num_y,            block_num_z, thread_num_x, thread_num_y, thread_num_z,            sharedMemBytes, stream, calc_norm_kernel_arg, NULL);    CUDA_CHECK(res, "cuLaunchKernel(calcuateNorm)");    return LATENT_SVM_OK;}
开发者ID:ZenzouFuruta,项目名称:Autoware,代码行数:53,


示例14: MPIComm::ThreadFunc

void MPIComm::ThreadFunc(int device){#ifndef CPU_ONLY  //LOG(ERROR)<<"device_id is "<<device;  CUDA_CHECK(cudaSetDevice(device));#endif  started_.store(true);  MPIJob job;  while (true){    mutex::scoped_lock lock(queue_mutex_);    while( task_queue_.empty() && IsRunning()){      DLOG(INFO)<<"no job running, waiting on cond";      cond_work_.wait(lock);    }    lock.unlock();    DLOG(INFO)<<"Cond fulfilled, dispatching job";    if (IsRunning()){      job = task_queue_.front();      DLOG(INFO)<<task_queue_.size();      DispatchJob(job);      mutex::scoped_lock pop_lock(queue_mutex_);      task_queue_.pop();      pop_lock.unlock();      cond_finish_.notify_one();      DLOG(INFO)<<"job finished, poped taskqueue";    }else{      break;    }  }  // finish remaining jobs  while (!task_queue_.empty()){    boost::lock_guard<mutex> lock(queue_mutex_);    job = task_queue_.front();    task_queue_.pop();    DispatchJob(job);  }}
开发者ID:xiangqiaolxq,项目名称:caffe-parallel,代码行数:39,


示例15: morph

// Applies a morphological operation to `in` using `mask` as the structuring
// element. Only square masks up to 19x19 are accepted; the mask is copied into
// the kernel::cFilter symbol on the active device's stream, then the
// kernel::morph instantiation selected by `isDilation` is run.
Array<T>  morph(const Array<T> &in, const Array<T> &mask)
{
    const dim4 mdims = mask.dims();

    // Validate the mask shape before touching the device.
    if (mdims[0] != mdims[1]) {
        AF_ERROR("Only square masks are supported in cuda morph currently", AF_ERR_SIZE);
    }
    if (mdims[0] > 19) {
        AF_ERROR("Upto 19x19 square kernels are only supported in cuda currently", AF_ERR_SIZE);
    }

    Array<T> out = createEmptyArray<T>(in.dims());

    // Upload the mask (device-to-device copy) into the filter symbol.
    CUDA_CHECK(cudaMemcpyToSymbolAsync(
        kernel::cFilter, mask.get(), mdims[0] * mdims[1] * sizeof(T), 0,
        cudaMemcpyDeviceToDevice,
        cuda::getStream(cuda::getActiveDeviceId())));

    if (!isDilation) {
        kernel::morph<T, false>(out, in, mdims[0]);
    } else {
        kernel::morph<T, true >(out, in, mdims[0]);
    }

    return out;
}
开发者ID:hxiaox,项目名称:arrayfire,代码行数:23,


示例16: memAlloc

    // Returns a device buffer large enough for `elements` objects of type T on
    // the active device, or NULL when `elements` is 0. A free cached buffer of
    // exactly the rounded size is reused when one exists; otherwise a new one
    // is allocated with cudaMalloc and recorded in memory_maps.
    T* memAlloc(const size_t &elements)
    {
        const int n = getActiveDeviceId();
        T* ptr = NULL;

        // Request size rounded with divup to a multiple of 1024 bytes.
        size_t alloc_bytes = divup(sizeof(T) * elements, 1024) * 1024;

        if (elements == 0) {
            return ptr;
        }

        // FIXME: Add better checks for garbage collection
        // Perhaps look at total memory available as a metric
        const bool over_budget =
            memory_maps[n].size() >= MAX_BUFFERS || used_bytes >= MAX_BYTES;
        if (over_budget) {
            garbageCollect();
        }

        // Look for a cached buffer that is free and has exactly this size.
        mem_iter iter = memory_maps[n].begin();
        while (iter != memory_maps[n].end()) {
            if (iter->second.is_free && iter->second.bytes == alloc_bytes) {
                iter->second.is_free = false;
                used_bytes += alloc_bytes;
                return (T *)iter->first;
            }
            iter++;
        }

        // Perform garbage collection if memory can not be allocated,
        // then retry once; the retry is checked with CUDA_CHECK.
        if (cudaMalloc((void **)&ptr, alloc_bytes) != cudaSuccess) {
            garbageCollect();
            CUDA_CHECK(cudaMalloc((void **)(&ptr), alloc_bytes));
        }

        // Register the new buffer as in use.
        mem_info info = {false, alloc_bytes};
        memory_maps[n][ptr] = info;
        used_bytes += alloc_bytes;

        return ptr;
    }
开发者ID:maolingao,项目名称:arrayfire,代码行数:37,


示例17: SocketBuffer::Read

// Pops one received message from the channel's local queue and releases it.
// For a data message (`data` == true) the payload is first copied into this
// buffer (to GPU memory unless built CPU_ONLY) and the payload is deleted;
// for a control message only the wrapper object is deleted. Returns `this`.
SocketBuffer* SocketBuffer::Read(bool data) {
  QueuedMessage* qm = NULL;

  if (!data) {
    // Control message: take it from the control queue; no payload handling.
    qm = reinterpret_cast<QueuedMessage*>
      (this->channel_->receive_queue_ctrl.pop());
  } else {
    // Pop the message from local queue
    qm = reinterpret_cast<QueuedMessage*>
      (this->channel_->receive_queue.pop());
#ifndef CPU_ONLY
    // Copy the received buffer to GPU memory
    CUDA_CHECK(cudaMemcpy(this->addr(), qm->buffer,  // NOLINT(caffe/alt_fn)
               qm->size, cudaMemcpyHostToDevice));  // NOLINT(caffe/alt_fn)
#else
    //caffe_copy(qm->size, qm->buffer, this->addr_);
    memcpy(this->addr_, qm->buffer, qm->size);
#endif
    // Free up the buffer ...
    // NOTE(review): scalar delete is used here; confirm the producer does not
    // allocate qm->buffer with new[].
    delete qm->buffer;
  }

  // ... and the wrapper object
  delete qm;
  return this;
}
开发者ID:Aravindreddy986,项目名称:CaffeOnSpark,代码行数:24,


示例18: P2PSync::~P2PSync

// Destructor. When built with GPU support it switches to this solver's
// device, frees parent_grads_ and disables peer access to the parent's device
// (if the devices report peer access), then restores the device that was
// current on entry. Does nothing in CPU_ONLY builds.
P2PSync<Dtype>::~P2PSync() {
#ifndef CPU_ONLY
    // Remember the caller's device so it can be restored at the end.
    int saved_device;
    CUDA_CHECK(cudaGetDevice(&saved_device));

    const int own_device = solver_->param().device_id();
    CUDA_CHECK(cudaSetDevice(own_device));

    if (parent_) {
        CUDA_CHECK(cudaFree(parent_grads_));

        const int parent_device = parent_->solver_->param().device_id();
        int can_access;
        CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access, own_device,
                                           parent_device));
        if (can_access) {
            CUDA_CHECK(cudaDeviceDisablePeerAccess(parent_device));
        }
    }

    CUDA_CHECK(cudaSetDevice(saved_device));
#endif
}
开发者ID:flair2005,项目名称:Caffe-Solution,代码行数:20,


示例19: DevicePair::compute

void DevicePair::compute(const vector<int> devices, vector<DevicePair>* pairs) {#ifndef CPU_ONLY    vector<int> remaining(devices);    // Depth for reduction tree    int remaining_depth = static_cast<int>(ceil(log2(remaining.size())));    // Group GPUs by board    for (int d = 0; d < remaining_depth; ++d) {        for (int i = 0; i < remaining.size(); ++i) {            for (int j = i + 1; j < remaining.size(); ++j) {                cudaDeviceProp a, b;                CUDA_CHECK(cudaGetDeviceProperties(&a, remaining[i]));                CUDA_CHECK(cudaGetDeviceProperties(&b, remaining[j]));                if (a.isMultiGpuBoard && b.isMultiGpuBoard) {                    if (a.multiGpuBoardGroupID == b.multiGpuBoardGroupID) {                        pairs->push_back(DevicePair(remaining[i], remaining[j]));                        DLOG(INFO) << "GPU board: " << remaining[i] << ":" << remaining[j];                        remaining.erase(remaining.begin() + j);                        break;                    }                }            }        }    }    ostringstream s;    for (int i = 0; i < remaining.size(); ++i) {        s << (i ? 
", " : "") << remaining[i];    }    DLOG(INFO) << "GPUs paired by boards, remaining: " << s.str();    // Group by P2P accessibility    remaining_depth = ceil(log2(remaining.size()));    for (int d = 0; d < remaining_depth; ++d) {        for (int i = 0; i < remaining.size(); ++i) {            for (int j = i + 1; j < remaining.size(); ++j) {                int access;                CUDA_CHECK(                    cudaDeviceCanAccessPeer(&access, remaining[i], remaining[j]));                if (access) {                    pairs->push_back(DevicePair(remaining[i], remaining[j]));                    DLOG(INFO) << "P2P pair: " << remaining[i] << ":" << remaining[j];                    remaining.erase(remaining.begin() + j);                    break;                }            }        }    }    s.str("");    for (int i = 0; i < remaining.size(); ++i) {        s << (i ? ", " : "") << remaining[i];    }    DLOG(INFO) << "GPUs paired by P2P access, remaining: " << s.str();    // Group remaining    remaining_depth = ceil(log2(remaining.size()));    for (int d = 0; d < remaining_depth; ++d) {        for (int i = 0; i < remaining.size(); ++i) {            pairs->push_back(DevicePair(remaining[i], remaining[i + 1]));            DLOG(INFO) << "Remaining pair: " << remaining[i] << ":"                       << remaining[i + 1];            remaining.erase(remaining.begin() + i + 1);        }    }    // Should only be the parent node remaining    CHECK_EQ(remaining.size(), 1);    pairs->insert(pairs->begin(), DevicePair(-1, remaining[0]));    CHECK(pairs->size() == devices.size());    for (int i = 0; i < pairs->size(); ++i) {        CHECK((*pairs)[i].parent() != (*pairs)[i].device());        for (int j = i + 1; j < pairs->size(); ++j) {            CHECK((*pairs)[i].device() != (*pairs)[j].device());        }    }#else    NO_GPU;#endif}
开发者ID:flair2005,项目名称:Caffe-Solution,代码行数:81,


示例20: orb

void orb(unsigned* out_feat,         float** d_x,         float** d_y,         float** d_score,         float** d_ori,         float** d_size,         unsigned** d_desc,         std::vector<unsigned>& feat_pyr,         std::vector<float*>& d_x_pyr,         std::vector<float*>& d_y_pyr,         std::vector<unsigned>& lvl_best,         std::vector<float>& lvl_scl,         std::vector<CParam<T> >& img_pyr,         const float fast_thr,         const unsigned max_feat,         const float scl_fctr,         const unsigned levels){    unsigned patch_size = REF_PAT_SIZE;    unsigned max_levels = feat_pyr.size();    // In future implementations, the user will be capable of passing his    // distribution instead of using the reference one    //CUDA_CHECK(cudaMemcpyToSymbol(d_ref_pat, h_ref_pat, 256 * 4 * sizeof(int), 0, cudaMemcpyHostToDevice));    std::vector<float*> d_score_pyr(max_levels);    std::vector<float*> d_ori_pyr(max_levels);    std::vector<float*> d_size_pyr(max_levels);    std::vector<unsigned*> d_desc_pyr(max_levels);    std::vector<unsigned*> d_idx_pyr(max_levels);    unsigned total_feat = 0;    // Calculate a separable Gaussian kernel    unsigned gauss_len = 9;    convAccT* h_gauss = new convAccT[gauss_len];    gaussian1D(h_gauss, gauss_len, 2.f);    Param<convAccT> gauss_filter;    gauss_filter.dims[0] = gauss_len;    gauss_filter.strides[0] = 1;    for (int k = 1; k < 4; k++) {        gauss_filter.dims[k] = 1;        gauss_filter.strides[k] = gauss_filter.dims[k - 1] * gauss_filter.strides[k - 1];    }    dim_type gauss_elem = gauss_filter.strides[3] * gauss_filter.dims[3];    gauss_filter.ptr = memAlloc<convAccT>(gauss_elem);    CUDA_CHECK(cudaMemcpy(gauss_filter.ptr, h_gauss, gauss_elem * sizeof(convAccT), cudaMemcpyHostToDevice));    delete[] h_gauss;    for (int i = 0; i < (int)max_levels; i++) {        if (feat_pyr[i] == 0 || lvl_best[i] == 0) {            if (i > 0)                memFree((T*)img_pyr[i].ptr);            continue;        }        
unsigned* d_usable_feat = memAlloc<unsigned>(1);        CUDA_CHECK(cudaMemset(d_usable_feat, 0, sizeof(unsigned)));        float* d_x_harris = memAlloc<float>(feat_pyr[i]);        float* d_y_harris = memAlloc<float>(feat_pyr[i]);        float* d_score_harris = memAlloc<float>(feat_pyr[i]);        // Calculate Harris responses        // Good block_size >= 7 (must be an odd number)        dim3 threads(THREADS_X, THREADS_Y);        dim3 blocks(divup(feat_pyr[i], threads.x), 1);        harris_response<T,false><<<blocks, threads>>>(d_x_harris, d_y_harris, d_score_harris, NULL,                                                      d_x_pyr[i], d_y_pyr[i], NULL,                                                      feat_pyr[i], d_usable_feat,                                                      img_pyr[i], 7, 0.04f, patch_size);        POST_LAUNCH_CHECK();        unsigned usable_feat = 0;        CUDA_CHECK(cudaMemcpy(&usable_feat, d_usable_feat, sizeof(unsigned), cudaMemcpyDeviceToHost));        memFree(d_x_pyr[i]);        memFree(d_y_pyr[i]);        memFree(d_usable_feat);        feat_pyr[i] = usable_feat;        if (feat_pyr[i] == 0) {            memFree(d_x_harris);            memFree(d_y_harris);            memFree(d_score_harris);            if (i > 0)                memFree((T*)img_pyr[i].ptr);            continue;        }        Param<float> harris_sorted;        Param<unsigned> harris_idx;        harris_sorted.dims[0] = harris_idx.dims[0] = feat_pyr[i];        harris_sorted.strides[0] = harris_idx.strides[0] = 1;//.........这里部分代码省略.........
开发者ID:pavanky,项目名称:arrayfire,代码行数:101,


示例21: getFeatureMapsGPUStream

/*// Getting feature map for the selected subimage in GPU//// API//int getFeatureMapsGPUStream(const int numStep, const int k,          CvLSVMFeatureMapGPU **devs_img, CvLSVMFeatureMapGPU **devs_map,          CUstream *streams)// INPUT// numStep// k// devs_img// streams// OUTPUT// devs_map// RESULT// Error status*/int getFeatureMapsGPUStream(const int numStep, const int k,        CvLSVMFeatureMapGPU **devs_img, CvLSVMFeatureMapGPU **devs_map,        CUstream *streams){    int sizeX, sizeY;    int p, px;    int height, width;    int i, j;    int *nearest;    float *w, a_x, b_x;    int size_r, size_alfa, size_nearest, size_w, size_map;    CUresult res;    CvLSVMFeatureMapGPU **devs_r, **devs_alfa;    CUdeviceptr dev_nearest, dev_w;    px = 3 * NUM_SECTOR;    p = px;    size_nearest = k;    size_w = k * 2;    devs_r = (CvLSVMFeatureMapGPU **) malloc(            sizeof(CvLSVMFeatureMapGPU*) * numStep);    devs_alfa = (CvLSVMFeatureMapGPU **) malloc(            sizeof(CvLSVMFeatureMapGPU*) * numStep);    nearest = (int *) malloc(sizeof(int) * size_nearest);    w = (float *) malloc(sizeof(float) * size_w);    // initialize "nearest" and "w"    for (i = 0; i < k / 2; i++)    {        nearest[i] = -1;    }/*for(i = 0; i < k / 2; i++)*/    for (i = k / 2; i < k; i++)    {        nearest[i] = 1;    }/*for(i = k / 2; i < k; i++)*/    for (j = 0; j < k / 2; j++)    {        b_x = k / 2 + j + 0.5f;        a_x = k / 2 - j - 0.5f;        w[j * 2] = 1.0f / a_x * ((a_x * b_x) / (a_x + b_x));        w[j * 2 + 1] = 1.0f / b_x * ((a_x * b_x) / (a_x + b_x));    }/*for(j = 0; j < k / 2; j++)*/    for (j = k / 2; j < k; j++)    {        a_x = j - k / 2 + 0.5f;        b_x = -j + k / 2 - 0.5f + k;        w[j * 2] = 1.0f / a_x * ((a_x * b_x) / (a_x + b_x));        w[j * 2 + 1] = 1.0f / b_x * ((a_x * b_x) / (a_x + b_x));    }/*for(j = k / 2; j < k; j++)*/    res = cuMemAlloc(&dev_nearest, sizeof(int) * size_nearest);    CUDA_CHECK(res, "cuMemAlloc(dev_nearest)");    res = cuMemAlloc(&dev_w, 
sizeof(float) * size_w);    CUDA_CHECK(res, "cuMemAlloc(dev_w)");    res = cuMemcpyHtoDAsync(dev_nearest, nearest, sizeof(int) * size_nearest,            streams[numStep - 1]);    res = cuMemcpyHtoDAsync(dev_w, w, sizeof(float) * size_w,            streams[numStep - 1]);    // allocate device memory    for (i = 0; i < numStep; i++)    {        width = devs_img[i]->sizeX;        height = devs_img[i]->sizeY;        allocFeatureMapObjectGPU<float>(&devs_r[i], width, height, 1);        allocFeatureMapObjectGPU<int>(&devs_alfa[i], width, height, 2);    }    // excute async    for (i = 0; i < numStep; i++)    {        // initialize "map", "r" and "alfa"        width = devs_img[i]->sizeX;        height = devs_img[i]->sizeY;        sizeX = width / k;//.........这里部分代码省略.........
开发者ID:ZenzouFuruta,项目名称:Autoware,代码行数:101,


示例22: PCAFeatureMapsGPUStream

/*// Feature map reduction in GPU// In each cell we reduce dimension of the feature vector// according to original paper special procedure//// API//int PCAFeatureMapsGPUStream(const int numStep, const int bx, const int by,          CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMap **feature_maps,          CUstream *streams)// INPUT// numStep// bx// by// devs_map_in// streams// OUTPUT// feature_maps// RESULT// Error status*/int PCAFeatureMapsGPUStream(const int numStep, const int bx, const int by,        CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMap **feature_maps,        CUstream *streams){    int sizeX, sizeY, pp;    int size_map_pca;    int i;    CUresult res;    CvLSVMFeatureMapGPU **devs_map_pca;    pp = NUM_SECTOR * 3 + 4;    devs_map_pca = (CvLSVMFeatureMapGPU **) malloc(            sizeof(CvLSVMFeatureMapGPU*) * (numStep));    // allocate memory    for (i = 0; i < numStep; i++)    {        sizeX = devs_map_in[i]->sizeX + 2 * bx;        sizeY = devs_map_in[i]->sizeY + 2 * by;        size_map_pca = sizeX * sizeY * pp;        allocFeatureMapObject(&feature_maps[i], sizeX, sizeY, pp);        allocFeatureMapObjectGPU<float>(&devs_map_pca[i], sizeX, sizeY, pp);    }    // exucute async    for (i = 0; i < numStep; i++)    {        sizeX = devs_map_pca[i]->sizeX;        sizeY = devs_map_pca[i]->sizeY;        size_map_pca = sizeX * sizeY * pp;        // initilize device memory value of 0        res = cuMemsetD32Async(devs_map_pca[i]->map, 0, size_map_pca,                streams[i]);        CUDA_CHECK(res, "cuMemset(dev_map_pca)");        // launch kernel        PCAFeatureMapsAddNullableBorderGPULaunch(devs_map_in[i],                devs_map_pca[i], bx, by, streams[i]);    }    for (i = 0; i < numStep; i++)    {        sizeX = devs_map_pca[i]->sizeX;        sizeY = devs_map_pca[i]->sizeY;        size_map_pca = sizeX * sizeY * pp;        // copy memory from device to host        res = cuMemcpyDtoHAsync(feature_maps[i]->map, devs_map_pca[i]->map,                
sizeof(float) * size_map_pca, streams[i]);        CUDA_CHECK(res, "cuMemcpyDtoH(dev_map_pca)");    }    // free device memory    for (i = 0; i < numStep; i++)    {        freeFeatureMapObjectGPU(&devs_map_pca[i]);    }    free(devs_map_pca);    return LATENT_SVM_OK;}
开发者ID:ZenzouFuruta,项目名称:Autoware,代码行数:87,


示例23: normalizeAndTruncateGPUStream

/*// Feature map Normalization and Truncation in GPU//// API//int normalizeAndTruncateGPUStream(const int numStep, const float alfa,          CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMapGPU **devs_map_out,          CUstream *streams)// INPUT// numStep// alfa// devs_map_in// streams// OUTPUT// devs_map_out// RESULT// Error status*/int normalizeAndTruncateGPUStream(const int numStep, const float alfa,        CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMapGPU **devs_map_out,        CUstream *streams){    int sizeX, sizeY, newSizeX, newSizeY, pp;    int size_norm, size_map_out;    int i;    CUresult res;    CvLSVMFeatureMapGPU **devs_norm;    pp = NUM_SECTOR * 12;    devs_norm = (CvLSVMFeatureMapGPU **) malloc(            sizeof(CvLSVMFeatureMapGPU*) * (numStep));    // allocate device memory    for (i = 0; i < numStep; i++)    {        sizeX = devs_map_in[i]->sizeX;        sizeY = devs_map_in[i]->sizeY;        newSizeX = sizeX - 2;        newSizeY = sizeY - 2;        allocFeatureMapObjectGPU<float>(&devs_norm[i], sizeX, sizeY, 1);    }    // exucute async    for (i = 0; i < numStep; i++)    {        sizeX = devs_map_in[i]->sizeX;        sizeY = devs_map_in[i]->sizeY;        newSizeX = sizeX - 2;        newSizeY = sizeY - 2;        size_norm = sizeX * sizeY;        size_map_out = newSizeX * newSizeY * pp;        // initilize device memory value of 0        res = cuMemsetD32Async(devs_norm[i]->map, 0, size_norm, streams[i]);        CUDA_CHECK(res, "cuMemset(dev_norm)");        res = cuMemsetD32Async(devs_map_out[i]->map, 0, size_map_out,                streams[i]);        CUDA_CHECK(res, "cuMemset(dev_map_out)");        // launch kernel        calculateNormGPULaunch(devs_map_in[i], devs_norm[i], streams[i]);    }    for (i = 0; i < numStep; i++)    {        // launch kernel        normalizeGPULaunch(alfa, devs_map_in[i], devs_norm[i], devs_map_out[i],                streams[i]);    }    // synchronize cuda stream    for (i = 0; i < numStep; i++)    {        
cuStreamSynchronize(streams[i]);    }    // free device memory    for (i = 0; i < numStep; i++)    {        freeFeatureMapObjectGPU(&devs_norm[i]);    }    free(devs_norm);    return LATENT_SVM_OK;}
开发者ID:ZenzouFuruta,项目名称:Autoware,代码行数:89,


示例24: MultiStageMeanfieldLayer::LayerSetUp

void MultiStageMeanfieldLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,      const vector<Blob<Dtype>*>& top) {  init_cpu = false;  init_gpu = false;  const caffe::MultiStageMeanfieldParameter meanfield_param = this->layer_param_.multi_stage_meanfield_param();  num_iterations_ = meanfield_param.num_iterations();  CHECK_GT(num_iterations_, 1) << "Number of iterations must be greater than 1.";  theta_alpha_ = meanfield_param.theta_alpha();  theta_beta_ = meanfield_param.theta_beta();  theta_gamma_ = meanfield_param.theta_gamma();  count_ = bottom[0]->count();  num_ = bottom[0]->num();  channels_ = bottom[0]->channels();  height_ = bottom[0]->height();  width_ = bottom[0]->width();  num_pixels_ = height_ * width_;  LOG(INFO) << "This implementation has not been tested batch size > 1.";  top[0]->Reshape(num_, channels_, height_, width_);  // Initialize the parameters that will updated by backpropagation.  if (this->blobs_.size() > 0) {    LOG(INFO) << "Multimeanfield layer skipping parameter initialization.";  } else {    this->blobs_.resize(3);// blobs_[0] - spatial kernel weights, blobs_[1] - bilateral kernel weights, blobs_[2] - compatability matrix    // Allocate space for kernel weights.    this->blobs_[0].reset(new Blob<Dtype>(1, 1, channels_, channels_));    this->blobs_[1].reset(new Blob<Dtype>(1, 1, channels_, channels_));    caffe_set(channels_ * channels_, Dtype(0.), this->blobs_[0]->mutable_cpu_data());    caffe_set(channels_ * channels_, Dtype(0.), this->blobs_[1]->mutable_cpu_data());    // Initialize the kernels weights. The two files spatial.par and bilateral.par should be available.    FILE * pFile;    pFile = fopen("spatial.par", "r");    CHECK(pFile) << "The file 'spatial.par' is not found. 
Please create it with initial spatial kernel weights.";    for (int i = 0; i < channels_; i++) {      fscanf(pFile, "%lf", &this->blobs_[0]->mutable_cpu_data()[i * channels_ + i]);    }    fclose(pFile);    pFile = fopen("bilateral.par", "r");    CHECK(pFile) << "The file 'bilateral.par' is not found. Please create it with initial bilateral kernel weights.";    for (int i = 0; i < channels_; i++) {      fscanf(pFile, "%lf", &this->blobs_[1]->mutable_cpu_data()[i * channels_ + i]);    }    fclose(pFile);    // Initialize the compatibility matrix.    this->blobs_[2].reset(new Blob<Dtype>(1, 1, channels_, channels_));    caffe_set(channels_ * channels_, Dtype(0.), this->blobs_[2]->mutable_cpu_data());    // Initialize it to have the Potts model.    for (int c = 0; c < channels_; ++c) {      (this->blobs_[2]->mutable_cpu_data())[c * channels_ + c] = Dtype(-1.);    }  }  float spatial_kernel[2 * num_pixels_];  float *spatial_kernel_gpu_;  compute_spatial_kernel(spatial_kernel);  spatial_lattice_.reset(new ModifiedPermutohedral());  spatial_norm_.Reshape(1, 1, height_, width_);  Dtype* norm_data_gpu ;  Dtype*  norm_data;  // Initialize the spatial lattice. This does not need to be computed for every image because we use a fixed size.  switch (Caffe::mode()) {    case Caffe::CPU:      norm_data = spatial_norm_.mutable_cpu_data();      spatial_lattice_->init(spatial_kernel, 2, width_, height_);      // Calculate spatial filter normalization factors.      
norm_feed_= new Dtype[num_pixels_];      caffe_set(num_pixels_, Dtype(1.0), norm_feed_);      // pass norm_feed and norm_data to gpu      spatial_lattice_->compute(norm_data, norm_feed_, 1);      bilateral_kernel_buffer_ = new float[5 * num_pixels_];      init_cpu = true;      break;    #ifndef CPU_ONLY    case Caffe::GPU:      CUDA_CHECK(cudaMalloc((void**)&spatial_kernel_gpu_, 2*num_pixels_ * sizeof(float))) ;      CUDA_CHECK(cudaMemcpy(spatial_kernel_gpu_, spatial_kernel, 2*num_pixels_ * sizeof(float), cudaMemcpyHostToDevice)) ;      spatial_lattice_->init(spatial_kernel_gpu_, 2, width_, height_);      CUDA_CHECK(cudaMalloc((void**)&norm_feed_, num_pixels_ * sizeof(Dtype))) ;      caffe_gpu_set(num_pixels_, Dtype(1.0), norm_feed_);      norm_data_gpu = spatial_norm_.mutable_gpu_data();      spatial_lattice_->compute(norm_data_gpu, norm_feed_, 1);       norm_data = spatial_norm_.mutable_cpu_data();      CUDA_CHECK(cudaMalloc((void**)&bilateral_kernel_buffer_, 5 * num_pixels_ * sizeof(float))) ;      CUDA_CHECK(cudaFree(spatial_kernel_gpu_));      init_gpu = true;      break;//.........这里部分代码省略.........
开发者ID:hyenal,项目名称:crfasrnn,代码行数:101,


示例25: getPathOfFeaturePyramidGPUStream

/*// Property Message//// API//static int getPathOfFeaturePyramidGPUStream(IplImage * image, float step,          int numStep, int startIndex, int sideLength, int bx, int by,          CvLSVMFeaturePyramid **maps)// INPUT// image// step// numStep// startIndex// sideLength// bx// by// OUTPUT// maps// RESULT// Error status*/static int getPathOfFeaturePyramidGPUStream(IplImage * image, float step,        int numStep, int startIndex, int sideLength, int bx, int by,        CvLSVMFeaturePyramid **maps){    CvLSVMFeatureMap **feature_maps;    int i;    int width, height, numChannels, sizeX, sizeY, p, pp, newSizeX, newSizeY;    float *scales;    CvLSVMFeatureMapGPU **devs_img, **devs_map_pre_norm, **devs_map_pre_pca;    CUstream *streams;    CUresult res;    scales = (float *) malloc(sizeof(float) * (numStep));    devs_img = (CvLSVMFeatureMapGPU **) malloc(            sizeof(CvLSVMFeatureMapGPU*) * (numStep));    devs_map_pre_norm = (CvLSVMFeatureMapGPU **) malloc(            sizeof(CvLSVMFeatureMapGPU*) * (numStep));    devs_map_pre_pca = (CvLSVMFeatureMapGPU **) malloc(            sizeof(CvLSVMFeatureMapGPU*) * (numStep));    streams = (CUstream *) malloc(sizeof(CUstream) * (numStep));    feature_maps = (CvLSVMFeatureMap **) malloc(            sizeof(CvLSVMFeatureMap *) * (numStep));    // allocate device memory    for (i = 0; i < numStep; i++)    {        scales[i] = 1.0f / powf(step, (float) i);        width  = (int) (((float) image->width ) * scales[i] + 0.5);        height = (int) (((float) image->height) * scales[i] + 0.5);        numChannels = image->nChannels;        sizeX = width  / sideLength;        sizeY = height / sideLength;        p  = NUM_SECTOR * 3;        pp = NUM_SECTOR * 12;        newSizeX = sizeX - 2;        newSizeY = sizeY - 2;        allocFeatureMapObjectGPU<float>(&devs_img[i], width, height,                numChannels);        allocFeatureMapObjectGPU<float>(&devs_map_pre_norm[i], sizeX, sizeY, p);        
allocFeatureMapObjectGPU<float>(&devs_map_pre_pca[i], newSizeX,                newSizeY, pp);        res = cuStreamCreate(&streams[i], CU_STREAM_DEFAULT);        CUDA_CHECK(res, "cuStreamCreate(stream)");    }    // excute main function    resizeGPUStream(numStep, image, scales, devs_img, streams);    getFeatureMapsGPUStream(numStep, sideLength, devs_img, devs_map_pre_norm,            streams);    normalizeAndTruncateGPUStream(numStep, Val_Of_Truncate, devs_map_pre_norm,            devs_map_pre_pca, streams);    PCAFeatureMapsGPUStream(numStep, bx, by, devs_map_pre_pca, feature_maps,            streams);    // synchronize cuda stream    for (i = 0; i < numStep; i++)    {        cuStreamSynchronize(streams[i]);        cuStreamDestroy(streams[i]);    }    for (i = 0; i < numStep; i++)    {        (*maps)->pyramid[startIndex + i] = feature_maps[i];    }/*for(i = 0; i < numStep; i++)*/    // free device memory    for (i = 0; i < numStep; i++)    {        freeFeatureMapObjectGPU(&devs_img[i]);        freeFeatureMapObjectGPU(&devs_map_pre_norm[i]);        freeFeatureMapObjectGPU(&devs_map_pre_pca[i]);    }    free(scales);//.........这里部分代码省略.........
开发者ID:ZenzouFuruta,项目名称:Autoware,代码行数:101,


示例26: pocl_cuda_alloc_mem_obj

cl_intpocl_cuda_alloc_mem_obj (cl_device_id device, cl_mem mem_obj, void *host_ptr){  cuCtxSetCurrent (((pocl_cuda_device_data_t *)device->data)->context);  CUresult result;  void *b = NULL;  /* if memory for this global memory is not yet allocated -> do it */  if (mem_obj->device_ptrs[device->global_mem_id].mem_ptr == NULL)    {      cl_mem_flags flags = mem_obj->flags;      if (flags & CL_MEM_USE_HOST_PTR)        {#if defined __arm__          // cuMemHostRegister is not supported on ARN          // Allocate device memory and perform explicit copies          // before and after running a kernel          result = cuMemAlloc ((CUdeviceptr *)&b, mem_obj->size);          CUDA_CHECK (result, "cuMemAlloc");#else          result = cuMemHostRegister (host_ptr, mem_obj->size,                                      CU_MEMHOSTREGISTER_DEVICEMAP);          if (result != CUDA_SUCCESS              && result != CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED)            CUDA_CHECK (result, "cuMemHostRegister");          result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b, host_ptr, 0);          CUDA_CHECK (result, "cuMemHostGetDevicePointer");#endif        }      else if (flags & CL_MEM_ALLOC_HOST_PTR)        {          result = cuMemHostAlloc (&mem_obj->mem_host_ptr, mem_obj->size,                                   CU_MEMHOSTREGISTER_DEVICEMAP);          CUDA_CHECK (result, "cuMemHostAlloc");          result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b,                                              mem_obj->mem_host_ptr, 0);          CUDA_CHECK (result, "cuMemHostGetDevicePointer");        }      else        {          result = cuMemAlloc ((CUdeviceptr *)&b, mem_obj->size);          if (result != CUDA_SUCCESS)            {              const char *err;              cuGetErrorName (result, &err);              POCL_MSG_PRINT2 (__FUNCTION__, __LINE__,                               "-> Failed to allocate memory: %s/n", err);              return CL_MEM_OBJECT_ALLOCATION_FAILURE;          
  }        }      if (flags & CL_MEM_COPY_HOST_PTR)        {          result = cuMemcpyHtoD ((CUdeviceptr)b, host_ptr, mem_obj->size);          CUDA_CHECK (result, "cuMemcpyHtoD");        }      mem_obj->device_ptrs[device->global_mem_id].mem_ptr = b;      mem_obj->device_ptrs[device->global_mem_id].global_mem_id          = device->global_mem_id;    }  /* copy already allocated global mem info to devices own slot */  mem_obj->device_ptrs[device->dev_id]      = mem_obj->device_ptrs[device->global_mem_id];  return CL_SUCCESS;}
开发者ID:jrprice,项目名称:pocl,代码行数:70,


示例27: TestSAXPY

CUresultTestSAXPY( chCUDADevice *chDevice, size_t N, float alpha ){    CUresult status;    CUdeviceptr dptrOut = 0;    CUdeviceptr dptrIn = 0;    float *hostOut = 0;    float *hostIn = 0;    CUDA_CHECK( cuCtxPushCurrent( chDevice->context() ) );    CUDA_CHECK( cuMemAlloc( &dptrOut, N*sizeof(float) ) );    CUDA_CHECK( cuMemsetD32( dptrOut, 0, N ) );    CUDA_CHECK( cuMemAlloc( &dptrIn, N*sizeof(float) ) );    CUDA_CHECK( cuMemHostAlloc( (void **) &hostOut, N*sizeof(float), 0 ) );    CUDA_CHECK( cuMemHostAlloc( (void **) &hostIn, N*sizeof(float), 0 ) );    for ( size_t i = 0; i < N; i++ ) {        hostIn[i] = (float) rand() / (float) RAND_MAX;    }    CUDA_CHECK( cuMemcpyHtoDAsync( dptrIn, hostIn, N*sizeof(float ), NULL ) );    {        CUmodule moduleSAXPY;        CUfunction kernelSAXPY;        void *params[] = { &dptrOut, &dptrIn, &N, &alpha };                moduleSAXPY = chDevice->module( "saxpy.ptx" );        if ( ! moduleSAXPY ) {            status = CUDA_ERROR_NOT_FOUND;            goto Error;        }        CUDA_CHECK( cuModuleGetFunction( &kernelSAXPY, moduleSAXPY, "saxpy" ) );        CUDA_CHECK( cuLaunchKernel( kernelSAXPY, 1500, 1, 1, 512, 1, 1, 0, NULL, params, NULL ) );    }    CUDA_CHECK( cuMemcpyDtoHAsync( hostOut, dptrOut, N*sizeof(float), NULL ) );    CUDA_CHECK( cuCtxSynchronize() );    for ( size_t i = 0; i < N; i++ ) {        if ( fabsf( hostOut[i] - alpha*hostIn[i] ) > 1e-5f ) {            status = CUDA_ERROR_UNKNOWN;            goto Error;        }    }    status = CUDA_SUCCESS;    printf( "Well it worked!/n" );Error:    cuCtxPopCurrent( NULL );    cuMemFreeHost( hostOut );    cuMemFreeHost( hostIn );    cuMemFree( dptrOut );    cuMemFree( dptrIn );    return status;}
开发者ID:AnilVarmaBiruduraju,项目名称:cudahandbook,代码行数:56,


示例28: CUDA_CHECK

// Stores device_id in root_device_ and then selects that device through
// cudaSetDevice; the CUDA status is passed to CUDA_CHECK.
void Caffe::SetDevice(const int device_id)
{
  root_device_ = device_id;

  // Select the device that was just recorded.
  CUDA_CHECK(
      cudaSetDevice(root_device_));
}
开发者ID:Caffe-MPI,项目名称:Caffe-MPI.github.io,代码行数:4,



注:本文中的CUDA_CHECK函数示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


C++ CUDA_SAFE_CALL函数代码示例
C++ CUBLAS_CHECK函数代码示例
万事OK自学网:51自学网_软件自学网_CAD自学网。自学excel、自学PS、自学CAD、自学C语言、自学css3实例,是一个通过网络自主学习工作技能的自学平台,也是网友喜欢的软件自学网站。