#include #include #include #include "caffe/layers/batch_reindex_layer.hpp" #include "caffe/util/math_functions.hpp" namespace caffe { template __global__ void BRForward(const int count, const int inner_dim, const Dtype* in, const Dtype* permut, Dtype* out) { CUDA_KERNEL_LOOP(index, count) { int n = index / (inner_dim); int in_n = static_cast(permut[n]); out[index] = in[in_n * (inner_dim) + index % (inner_dim)]; } } template void BatchReindexLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { check_batch_reindex(bottom[0]->shape(0), bottom[1]->count(), bottom[1]->cpu_data()); if (top[0]->count() == 0) { return; } int threads = top[0]->count(); // NOLINT_NEXT_LINE(whitespace/operators) BRForward <<>>( top[0]->count(), bottom[0]->count() / bottom[0]->shape(0), bottom[0]->gpu_data(), bottom[1]->gpu_data(), top[0]->mutable_gpu_data()); CUDA_POST_KERNEL_CHECK; } template __global__ void BRBackward(const int count, const int inner_dim, const Dtype* in, const Dtype* top_indexes, const Dtype* begins, const Dtype* counts, Dtype* out) { CUDA_KERNEL_LOOP(index, count) { int n = index / (inner_dim); out[index] = 0; int lower = static_cast(begins[n]); int upper = lower + static_cast(counts[n]); for (int i = lower; i < upper; ++i) { int in_n = static_cast(top_indexes[i]); out[index] += in[in_n * (inner_dim) + index % (inner_dim)]; } } } template void BatchReindexLayer::Backward_gpu( const vector*>& top, const vector& propagate_down, const vector*>& bottom) { CHECK(!propagate_down[1]) << "Cannot backprop to index."; if (!propagate_down[0]) { return; } vector > mapping; const Dtype* perm = bottom[1]->cpu_data(); for (int i = 0; i < bottom[1]->count(); ++i) { mapping.push_back(pair(static_cast(perm[i]), i)); } std::sort(mapping.begin(), mapping.end(), pair_sort_first()); // Each element of the bottom diff is potentially the sum of many top diffs. // However, we'd like each CUDA thread to handle exactly one output. Hence, // we first pre-compute a list of lists of indices that need to be summed for // each output. `top_indexes` holds the data of this list of lists. The // k'th element of `begins` points to the location in `top_indexes` where the // list for the k'th example begin, and the k'th element of `counts` is the // length of that list. vector shape; shape.push_back(bottom[1]->count()); Blob top_indexes(shape); shape[0] = bottom[0]->shape(0); Blob counts(shape); Blob begins(shape); Dtype* t_i_data = top_indexes.mutable_cpu_data(); Dtype* c_data = counts.mutable_cpu_data(); Dtype* b_data = begins.mutable_cpu_data(); caffe_set(begins.count(), Dtype(-1), b_data); caffe_set(counts.count(), Dtype(0), c_data); for (int i = 0; i < mapping.size(); ++i) { t_i_data[i] = mapping[i].second; if (b_data[mapping[i].first] == -1) { b_data[mapping[i].first] = i; } c_data[mapping[i].first] += 1; } int threads = bottom[0]->count(); // NOLINT_NEXT_LINE(whitespace/operators) BRBackward <<>>( bottom[0]->count(), bottom[0]->count() / bottom[0]->shape(0), top[0]->gpu_diff(), top_indexes.gpu_data(), begins.gpu_data(), counts.gpu_data(), bottom[0]->mutable_gpu_diff()); CUDA_POST_KERNEL_CHECK; } INSTANTIATE_LAYER_GPU_FUNCS(BatchReindexLayer); } // namespace caffe