Unverified commit f83254d6, authored by Aurelius84, committed by GitHub

cherry-pick pyramid_hash op test=develop (#20779)(#18525) (#21562)

Parent e228e707
......@@ -48,14 +48,17 @@ if (WITH_DISTRIBUTE)
SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch)
endif()
-SET(OP_ONLY_MKL "")
-if (NOT WITH_MKL)
-    SET(OP_ONLY_MKL ${OP_ONLY_MKL} match_matrix_tensor_op)
-    SET(OP_ONLY_MKL ${OP_ONLY_MKL} var_conv_2d_op)
+SET(OP_MKL_DEPS "")
+if (NOT WITH_MKL OR NOT WITH_AVX)
+    SET(OP_MKL_DEPS ${OP_MKL_DEPS} match_matrix_tensor_op)
+    SET(OP_MKL_DEPS ${OP_MKL_DEPS} var_conv_2d_op)
endif()
+if(WITH_COVERAGE OR NOT WITH_AVX OR WIN32)
+    SET(OP_MKL_DEPS ${OP_MKL_DEPS} pyramid_hash_op)
+endif()
register_operators(EXCLUDES py_func_op warpctc_op dgc_op conv_fusion_op
-    sync_batch_norm_op multihead_matmul_op ${OP_ONLY_MKL} DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
+    sync_batch_norm_op multihead_matmul_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
if (WITH_GPU)
# warpctc_op needs cudnn 7 above
......
......@@ -286,8 +286,8 @@ class CPUMatchMatrixTensorOPGradKernel : public framework::OpKernel<T> {
auto* r_data = bottom_r_data + (offset_r[b] + j) * dim_in;
auto* r_diff = bottom_r_diff + (offset_r[b] + j) * dim_in;
if (diff != 0.0) {
-          sse_axpy(r_data, l_trans_diff, dim_in, diff);
-          sse_axpy(l_trans_data, r_diff, dim_in, diff);
+          avx_axpy(r_data, l_trans_diff, dim_in, diff);
+          avx_axpy(l_trans_data, r_diff, dim_in, diff);
}
}
}
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define BLOOMFILTER_MAGIC_NUM_NEW 17070416
#include <inttypes.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
namespace paddle {
namespace operators {
namespace math {
#pragma pack(4)
struct bloomfilter {
  uint64_t magic_num;           // must equal BLOOMFILTER_MAGIC_NUM_NEW
  uint64_t m;                   // number of bits in the filter
  uint64_t k;                   // number of hash probes per key
  uint64_t count;               // number of elements inserted
  unsigned char bit_vector[1];  // trailing bit array, m / 8 bytes
};
int bloomfilter_get(const struct bloomfilter *bloomfilter, const void *key,
size_t len);
int bloomfilter_check(struct bloomfilter *filter);
#define bit_get(v, n) ((v)[(n) >> 3] & (0x1 << (0x7 - ((n)&0x7))))
#define ROTL64(x, r) (((x) << (r)) | ((x) >> (64 - (r))))
#define BIG_CONSTANT(x) (x##LLU)
uint64_t fmix64(uint64_t k) {
k ^= k >> 33;
k *= BIG_CONSTANT(0xff51afd7ed558ccd);
k ^= k >> 33;
k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
k ^= k >> 33;
return k;
}
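// 128-bit MurmurHash3 (x64 variant). Note: the canonical byte-wise tail
// switch is replaced here by two masked 8-byte loads; those loads always
// read a full 16-byte block at the tail, so the key buffer must remain
// readable up to 16 bytes past the hashed data.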
void murmurhash3_x64_128(const void *key, const int len, const uint32_t seed,
void *out) {
const uint8_t *data = (const uint8_t *)key;
const int nblocks = len / 16;
uint64_t h1 = seed;
uint64_t h2 = seed;
int i = 0;
const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
//----------
// body
const uint64_t *blocks = (const uint64_t *)(data);
uint64_t k1;
uint64_t k2;
for (i = 0; i < nblocks; i++) {
k1 = blocks[i * 2 + 0];
k2 = blocks[i * 2 + 1];
k1 *= c1;
k1 = ROTL64(k1, 31);
k1 *= c2;
h1 ^= k1;
h1 = ROTL64(h1, 27);
h1 += h2;
h1 = h1 * 5 + 0x52dce729;
k2 *= c2;
k2 = ROTL64(k2, 33);
k2 *= c1;
h2 ^= k2;
h2 = ROTL64(h2, 31);
h2 += h1;
h2 = h2 * 5 + 0x38495ab5;
}
//----------
// tail
const uint8_t *tail = (const uint8_t *)(data + nblocks * 16);
uint64_t nk1 = 0;
uint64_t nk2 = 0;
uint64_t tail0_64 = *(uint64_t *)(tail); // NOLINT
uint64_t tail_64 = *(uint64_t *)(tail + 8); // NOLINT
uint64_t mask0 = 0xffffffffffffffff;
uint64_t mask = 0x00ffffffffffffff;
int flag = len & 15;
if (flag && flag <= 8) {
tail0_64 &= (mask0 >> ((8 - flag) << 3));
} else if (flag > 8) {
tail_64 &= (mask >> ((15 - flag) << 3));
nk2 ^= tail_64;
nk2 *= c2;
nk2 = ROTL64(nk2, 33);
nk2 *= c1;
h2 ^= nk2;
}
if (flag) {
nk1 ^= tail0_64;
nk1 *= c1;
nk1 = ROTL64(nk1, 31);
nk1 *= c2;
h1 ^= nk1;
}
//----------
// finalization
h1 ^= len;
h2 ^= len;
h1 += h2;
h2 += h1;
h1 = fmix64(h1);
h2 = fmix64(h2);
h1 += h2;
h2 += h1;
reinterpret_cast<uint64_t *>(out)[0] = h1;
reinterpret_cast<uint64_t *>(out)[1] = h2;
}
int bloomfilter_check(struct bloomfilter *filter) {
if (filter->magic_num == BLOOMFILTER_MAGIC_NUM_NEW) {
return 1;
} else {
fprintf(stderr, "error magic_num %" PRIu64 "\n", filter->magic_num);
return 0;
}
}
int bloomfilter_get(const struct bloomfilter *bloomfilter, const void *key,
size_t len) {
uint32_t i;
uint64_t result[2];
for (i = 0; i < bloomfilter->k; i++) {
murmurhash3_x64_128(key, len, i, &result);
result[0] %= bloomfilter->m;
result[1] %= bloomfilter->m;
if (!bit_get(bloomfilter->bit_vector, result[0])) {
return 0;
}
if (!bit_get(bloomfilter->bit_vector, result[1])) {
return 0;
}
}
return 1;
}
} // namespace math
} // namespace operators
} // namespace paddle
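For orientation, below is a minimal sketch (not part of the commit) of how this header is exercised, assuming it is reachable on the include path as math/bloomfilter.h; in the operator further down, the filter bytes actually arrive pre-built through the WhiteList/BlackList input tensors.
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include "math/bloomfilter.h"
using paddle::operators::math::bloomfilter;
int main() {
  // Allocate an empty filter: struct header plus m / 8 bytes of bit vector.
  const uint64_t m = 1 << 16;  // bits in the filter
  const uint64_t k = 3;        // hash probes per key
  bloomfilter* f =
      static_cast<bloomfilter*>(calloc(1, sizeof(bloomfilter) + m / 8));
  f->magic_num = BLOOMFILTER_MAGIC_NUM_NEW;
  f->m = m;
  f->k = k;
  int32_t key[2] = {42, 7};
  // bloomfilter_check() validates the magic number; bloomfilter_get() probes
  // 2*k bits, so an all-zero bit vector reports every key as absent
  // (this prints "check=1 get=0").
  printf("check=%d get=%d\n", bloomfilter_check(f),
         bloomfilter_get(f, key, sizeof(key)));
  free(f);
  return 0;
}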
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <xxhash.h>
#include <algorithm>
#include <cmath>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/search_compute.h"
extern "C" {
#include "math/bloomfilter.h"
}
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using LoD = framework::LoD;
class PyramidHashOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"X (Tensor, MUST be Tensor<!!!_int32_!!!>) Input variable which "
"should contain lod information.");
AddInput("W", "W (Tensor)");
AddInput("WhiteList", "WhiteList (Tensor)");
AddInput("BlackList", "BlackList (Tensor)");
AddAttr<int>("num_emb", "num_emb").SetDefault(0).EqualGreaterThan(0);
AddAttr<int>("space_len", "space_len").SetDefault(0).EqualGreaterThan(0);
AddAttr<int>("pyramid_layer", "pyramid_layer (must be >= 2)")
.SetDefault(2)
.EqualGreaterThan(2);
AddAttr<int>("rand_len", "rand_len").SetDefault(0).EqualGreaterThan(0);
AddAttr<float>("drop_out_percent", "drop_out_percent")
.SetDefault(0)
.EqualGreaterThan(0);
AddAttr<int>("is_training", "is_training")
.SetDefault(0)
.EqualGreaterThan(0);
AddAttr<bool>("use_filter", "use_filter").SetDefault(true);
AddAttr<int>("white_list_len", "white_list_len")
.SetDefault(0)
.EqualGreaterThan(0);
AddAttr<int>("black_list_len", "black_list_len")
.SetDefault(0)
.EqualGreaterThan(0);
AddAttr<int>("seed", "seed").SetDefault(0).EqualGreaterThan(0);
AddAttr<float>("lr", "learning rate").SetDefault(0.0).EqualGreaterThan(0.0);
AddOutput("Out", "Out (Tensor, default Tensor<float>) Output variable");
AddOutput("DropPos", "Out (Tensor, Tensor<int>) Output variable");
AddOutput("X_Temp_Out", "Out (Tensor, Tensor<int>) Output variable")
.AsIntermediate();
AddComment(R"DOC(
PyramidHash
NOTE: only the 'float32' data type is supported now.
)DOC");
}
};
class PyramidHashOP : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "X(Input) should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true, "W(Input) should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
"Out(Output) should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasOutput("DropPos"), true,
"DropPos(TMP Output) should not be null.");
auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The rank of X(Input) should be 2.");
auto w_dims = ctx->GetInputDim("W");
PADDLE_ENFORCE_EQ(w_dims.size(), 2, "W should be 2-D tensor");
int space_len = ctx->Attrs().Get<int>("space_len");
int rand_len = ctx->Attrs().Get<int>("rand_len");
PADDLE_ENFORCE_EQ(w_dims[0], space_len + rand_len,
"w_dims[0] should be equal to (space_len + rand_len)");
PADDLE_ENFORCE_EQ(w_dims[1], 1, "w_dims[1] should be equal to 1");
int num_emb = ctx->Attrs().Get<int>("num_emb");
PADDLE_ENFORCE_EQ(num_emb % rand_len, 0,
"num_emb should be divisible by rand_len");
int white_list_len = ctx->Attrs().Get<int>("white_list_len");
if (white_list_len > 0) {
PADDLE_ENFORCE_EQ(
ctx->HasInput("WhiteList"), true,
"WhiteList(Input) should not be null when white_list_len > 0");
auto wl_dims = ctx->GetInputDim("WhiteList");
PADDLE_ENFORCE_EQ(wl_dims.size(), 2, "WhiteList should be 2-D tensor");
PADDLE_ENFORCE_EQ(wl_dims[0], white_list_len,
"wl_dims[0] should be equal to white_list_len");
PADDLE_ENFORCE_EQ(wl_dims[1], 1, "wl_dims[1] should be equal to 1");
}
int black_list_len = ctx->Attrs().Get<int>("black_list_len");
if (black_list_len > 0) {
PADDLE_ENFORCE_EQ(
ctx->HasInput("BlackList"), true,
"BlackList(Input) should not be null when black_list_len > 0");
auto bl_dims = ctx->GetInputDim("BlackList");
PADDLE_ENFORCE_EQ(bl_dims.size(), 2, "BlackList should be 2-D tensor");
PADDLE_ENFORCE_EQ(bl_dims[0], black_list_len,
"bl_dims[0] should be equal to black_list_len");
PADDLE_ENFORCE_EQ(bl_dims[1], 1, "bl_dims[1] should be equal to 1");
}
if (ctx->IsRuntime()) {
// Out's shape is determined by the kernel at runtime.
} else {
// compile time
ctx->SetOutputDim("Out", framework::make_ddim({-1, num_emb}));
ctx->SetOutputDim("X_Temp_Out", x_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "W"), ctx.GetPlace());
}
};
template <typename DeviceContext, typename T>
class CPUPyramidHashOPKernel : public framework::OpKernel<T> {
public:
bool should_use_term(math::bloomfilter* _filter,
math::bloomfilter* _black_filter, const T* word_repr,
int len) const {
return (!_filter ||
1 == math::bloomfilter_get(_filter, word_repr, len * sizeof(T))) &&
(!_black_filter ||
0 == math::bloomfilter_get(_black_filter, word_repr,
len * sizeof(T)));
}
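// Forward lookup: the num_emb-wide output row is assembled from
// num_emb / _rand_len chunks; the chunk at offset j is copied from the
// weight table at XXH32(hash_id, len, seed = j) % _space_len. Hash
// positions are computed one chunk ahead so the next source row can be
// prefetched while the current chunk is memcpy'd.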
void hash_embedding_ff(const T* hash_id, int len, T* top_pos,
const T* weights, int _num_emb, int _rand_len,
int _space_len) const {
unsigned int pos1 = XXH32(hash_id, len * sizeof(T), 0) % _space_len;
unsigned int pos2 = XXH32(hash_id, len * sizeof(T), _rand_len) % _space_len;
for (int j = 0; j != _num_emb; j += _rand_len) {
if (j + _rand_len < _num_emb) {
__builtin_prefetch(weights + pos2);
__builtin_prefetch(top_pos + j + _rand_len);
}
unsigned int pos3 =
XXH32(hash_id, len * sizeof(T), j + 2 * _rand_len) % _space_len;
memcpy(top_pos + j, const_cast<float*>(weights + pos1),
_rand_len * sizeof(T));
pos1 = pos2;
pos2 = pos3;
}
}
void Compute(const framework::ExecutionContext& ctx) const override {
auto* bottom = ctx.Input<LoDTensor>("X");
auto* _blobs_0 = ctx.Input<Tensor>("W");
auto* _blobs_1 = ctx.Input<Tensor>("WhiteList");
auto* _blobs_2 = ctx.Input<Tensor>("BlackList");
auto* top = ctx.Output<LoDTensor>("Out");
auto* drop_pos = ctx.Output<LoDTensor>("DropPos");
int _num_emb = ctx.Attr<int>("num_emb");
bool use_filter = ctx.Attr<bool>("use_filter");
int white_list_len = ctx.Attr<int>("white_list_len");
int black_list_len = ctx.Attr<int>("black_list_len");
int _pyramid_layer = ctx.Attr<int>("pyramid_layer");
int _is_training = ctx.Attr<int>("is_training");
int seed = ctx.Attr<int>("seed");
unsigned int _seed = (unsigned int)seed;
int _rand_len = ctx.Attr<int>("rand_len");
int _space_len = ctx.Attr<int>("space_len");
float _drop_out_percent = ctx.Attr<float>("drop_out_percent");
const auto& offset = bottom->lod()[0];
const auto* bottom_data_ori = bottom->data<int32_t>();
auto* buff = ctx.Output<LoDTensor>("X_Temp_Out");
buff->Resize(framework::make_ddim({bottom->dims()[0], bottom->dims()[1]}));
T* bottom_data = buff->mutable_data<T>(ctx.GetPlace());
for (int i = 0; i < bottom->dims()[0]; i++) {
bottom_data[i] = bottom_data_ori[i];
}
const auto* weights = _blobs_0->data<T>();
std::vector<size_t> top_offset;
top_offset.resize(offset.size());
top_offset[0] = 0;
math::bloomfilter* _filter = NULL;
math::bloomfilter* _black_filter = NULL;
if (use_filter) {
if (white_list_len != 0) {
_filter = (math::bloomfilter*)_blobs_1->data<T>();
PADDLE_ENFORCE_EQ(math::bloomfilter_check(_filter), 1,
"white filter not load");
}
if (black_list_len != 0) {
_black_filter = (math::bloomfilter*)_blobs_2->data<T>();
PADDLE_ENFORCE_EQ(math::bloomfilter_check(_black_filter), 1,
"black filter not load");
}
}
drop_pos->Resize(framework::make_ddim(
{bottom->dims()[0] * bottom->dims()[1] * _pyramid_layer, 1}));
std::vector<size_t> drop_pos_offset;
drop_pos_offset.resize(offset.size());
drop_pos_offset[0] = 0;
int* iter = drop_pos->mutable_data<int>(ctx.GetPlace());
int* iter_end = iter;
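// First pass: for each sequence, enumerate every sub-term of length
// 2..pyramid_layer and write 1 (kept) or 0 (dropped by the filters or by
// random dropout) into DropPos; the per-sequence counts of kept terms
// determine the LoD offsets of both Out and DropPos.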
for (size_t i = 0; i < top_offset.size() - 1; ++i) {
int w = offset[i + 1] - offset[i];
int nsentense_with_pyramid = 0;
if (w < 2) {
nsentense_with_pyramid = 0;
} else {
for (int ilayer = 1; ilayer < _pyramid_layer && ilayer < w; ++ilayer) {
for (int l = 0; l < w - ilayer; ++l) {
if (should_use_term(_filter, _black_filter,
(const T*)(bottom_data + offset[i] + l),
ilayer + 1)) {
if (_is_training != 0) {
unsigned int rand_val = rand_r(&_seed);
T rate = static_cast<T>(rand_val) / (RAND_MAX);
*(iter_end++) = (rate < _drop_out_percent ? 0 : 1);
} else {
*(iter_end++) = 1;
}
} else {
*(iter_end++) = 0;
}
}
}
nsentense_with_pyramid = std::count(iter, iter_end, 1);
iter = iter_end;
}
drop_pos_offset[i + 1] = drop_pos_offset[i] + nsentense_with_pyramid;
top_offset[i + 1] =
top_offset[i] +
(nsentense_with_pyramid == 0 ? 1 : nsentense_with_pyramid);
}
int top_l = top_offset[top_offset.size() - 1];
framework::LoD top_lod;
top_lod.push_back(top_offset);
top->set_lod(top_lod);
top->Resize(framework::make_ddim({top_l, _num_emb}));
auto* top_data = top->mutable_data<T>(ctx.GetPlace());
framework::LoD drop_pos_lod;
drop_pos_lod.push_back(drop_pos_offset);
drop_pos->set_lod(drop_pos_lod);
iter = drop_pos->mutable_data<int>(ctx.GetPlace());
int top_counter = 0;
for (size_t i = 0; i < offset.size() - 1; ++i) {
int w_drop = drop_pos_offset[i + 1] - drop_pos_offset[i];
int w = offset[i + 1] - offset[i];
if (w_drop == 0) {
if (w >= 2) {
for (int ilayer = 1; ilayer < _pyramid_layer && ilayer < w;
++ilayer) {
for (int l = 0; l < w - ilayer; ++l) {
iter++;
}
}
}
auto* top_pos = top_data + top_counter++ * _num_emb;
memset(top_pos, 0, _num_emb * sizeof(T));
continue;
}
if (w >= 2) {
for (int ilayer = 1; ilayer < _pyramid_layer && ilayer < w; ++ilayer) {
for (int l = 0; l < w - ilayer; ++l) {
if (*(iter++) == 0) {
// do nothing
} else {
auto* top_pos = top_data + top_counter++ * _num_emb;
hash_embedding_ff((const T*)(bottom_data + offset[i] + l),
ilayer + 1, top_pos, weights, _num_emb,
_rand_len, _space_len);
}
}
}
}
}
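// The emission pass must consume exactly the DropPos entries produced by
// the counting pass above; bail out hard on any mismatch.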
if (iter != iter_end) {
exit(1);
}
if (_is_training == 0) {
avx_axpy_noadd(top_data, top_data, top->dims()[0] * top->dims()[1],
_drop_out_percent);
}
}
};
class PyramidHashOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true, "Input(W) should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("DropPos"), true,
"Input(DropPos) should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("X_Temp_Out"), true,
"Input(X_Temp_Out) should not be null.");
PADDLE_ENFORCE_EQ(
ctx->HasInput(framework::GradVarName("Out")), true,
"Input(Out@GRAD) of PyramidHashGradOp should not be null.");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "W"), ctx.GetPlace());
}
};
class PyramidHashGradOpMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto* op_desc_ptr = new framework::OpDesc();
op_desc_ptr->SetType("pyramid_hash_grad");
op_desc_ptr->SetInput("X", Input("X"));
op_desc_ptr->SetInput("W", Input("W"));
op_desc_ptr->SetInput("DropPos", Output("DropPos"));
op_desc_ptr->SetInput("X_Temp_Out", Output("X_Temp_Out"));
op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op_desc_ptr->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(op_desc_ptr);
}
};
template <typename DeviceContext, typename T>
class CPUPyramidHashOPGradKernel : public framework::OpKernel<T> {
public:
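// Backward: no gradient tensor is produced for W; each kept term applies
// an in-place update, adding -lr * (top_diff chunk) onto the same hashed
// rows of W that the forward pass read from.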
void hash_embedding_bp(const T* hash_id, int len, const T* top_pos,
T* weights, T mlr, int _num_emb, int _rand_len,
int _space_len) const {
for (int j = 0; j != _num_emb; j += _rand_len) {
unsigned int pos = XXH32(hash_id, len * sizeof(T), j) % _space_len;
avx_axpy(top_pos + j, weights + pos, _rand_len, mlr);
}
}
void Compute(const framework::ExecutionContext& ctx) const override {
auto* bottom = ctx.Input<LoDTensor>("X");
auto* _blobs = ctx.Input<Tensor>("W");
auto* drop_pos = ctx.Input<LoDTensor>("DropPos");
auto* top = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
int _num_emb = ctx.Attr<int>("num_emb");
float _lr = ctx.Attr<float>("lr");
int _rand_len = ctx.Attr<int>("rand_len");
int _space_len = ctx.Attr<int>("space_len");
int _pyramid_layer = ctx.Attr<int>("pyramid_layer");
auto* buff = ctx.Input<LoDTensor>("X_Temp_Out");
auto* bottom_data = buff->data<T>();
int _slot_len = bottom->dims()[0];
if (static_cast<size_t>(_slot_len) == bottom->lod()[0].size() - 1 &&
std::count(bottom_data, bottom_data + _slot_len, -1) == _slot_len) {
return;
}
auto& offset = bottom->lod()[0];
auto& drop_pos_offset = drop_pos->lod()[0];
const auto* top_diff = top->data<T>();
T* weights = const_cast<T*>(_blobs->data<T>());
T mlr = -1.0 * _lr;
const int* iter = drop_pos->data<int>();
int top_counter = 0;
for (size_t i = 0; i < offset.size() - 1; ++i) {
int w = offset[i + 1] - offset[i];
int w_drop = drop_pos_offset[i + 1] - drop_pos_offset[i];
if (w_drop == 0) {
top_counter++;
}
if (w > 1) {
for (int ilayer = 1; ilayer < _pyramid_layer && ilayer < w; ++ilayer) {
for (int l = 0; l < w - ilayer; ++l) {
if (*(iter++) == 0) {
// do nothing
} else {
const T* top_pos = top_diff + top_counter++ * _num_emb;
hash_embedding_bp((const T*)(bottom_data + offset[i] + l),
ilayer + 1, top_pos, weights, mlr, _num_emb,
_rand_len, _space_len);
}
}
}
} else {
// do nothing
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plt = paddle::platform;
namespace frm = paddle::framework;
REGISTER_OPERATOR(pyramid_hash, ops::PyramidHashOP, ops::PyramidHashOpMaker,
ops::PyramidHashGradOpMaker);
REGISTER_OPERATOR(pyramid_hash_grad, ops::PyramidHashOpGrad);
REGISTER_OP_CPU_KERNEL(
pyramid_hash, ops::CPUPyramidHashOPKernel<plt::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
pyramid_hash_grad,
ops::CPUPyramidHashOPGradKernel<plt::CPUDeviceContext, float>);
......@@ -21,7 +21,6 @@ limitations under the License. */
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/dynload/mklml.h"
namespace paddle {
namespace operators {
......@@ -73,22 +72,10 @@ void call_gemm_batched(const framework::ExecutionContext& ctx,
}
}
#ifndef TYPE_USE_FLOAT
#define TYPE_USE_FLOAT
#endif
-#ifndef USE_SSE
-#define USE_SSE
-#endif
#if defined(TYPE_USE_FLOAT)
#define __m256x __m256
-#define __m128x __m128
static const unsigned int AVX_STEP_SIZE = 8;
-static const unsigned int SSE_STEP_SIZE = 4;
static const unsigned int AVX_CUT_LEN_MASK = 7U;
-static const unsigned int SSE_CUT_LEN_MASK = 3U;
#define _mm256_mul_px _mm256_mul_ps
#define _mm256_add_px _mm256_add_ps
......@@ -96,20 +83,11 @@ static const unsigned int SSE_CUT_LEN_MASK = 3U;
#define _mm256_store_px _mm256_storeu_ps
#define _mm256_broadcast_sx _mm256_broadcast_ss
-#define _mm_add_px _mm_add_ps
-#define _mm_mul_px _mm_mul_ps
-#define _mm_load_px _mm_loadu_ps
-#define _mm_store_px _mm_storeu_ps
-#define _mm_load1_px _mm_load1_ps
#endif
template <typename T>
-inline void sse_axpy(const T* x, T* y, size_t len, const T alpha) {
+inline void avx_axpy(const T* x, T* y, size_t len, const T alpha) {
  unsigned int jjj, lll;
  jjj = lll = 0;
-#if defined(USE_AVX)
  lll = len & ~AVX_CUT_LEN_MASK;
  __m256x mm_alpha = _mm256_broadcast_sx(&alpha);
  for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) {
......@@ -119,18 +97,24 @@ inline void sse_axpy(const T* x, T* y, size_t len, const T alpha) {
      _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj))));
  }
-#elif defined(USE_SSE)
-  lll = len & ~SSE_CUT_LEN_MASK;
-  __m128x mm_alpha = _mm_load1_px(&alpha);
-  for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) {
-    _mm_store_px(y + jjj,
-                 _mm_add_px(_mm_load_px(y + jjj),
-                            _mm_mul_px(mm_alpha, _mm_load_px(x + jjj))));
-  }
-#endif
  for (; jjj < len; jjj++) {
    y[jjj] += alpha * x[jjj];
  }
}
+
+template <typename T>
+inline void avx_axpy_noadd(const T* x, T* y, size_t len, const T alpha) {
+  unsigned int jjj, lll;
+  jjj = lll = 0;
+  lll = len & ~AVX_CUT_LEN_MASK;
+  __m256x mm_alpha = _mm256_broadcast_sx(&alpha);
+  for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) {
+    _mm256_store_px(y + jjj, _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj)));
+  }
+  for (; jjj < len; jjj++) {
+    y[jjj] = alpha * x[jjj];
+  }
+}
......
......@@ -31,6 +31,7 @@ __all__ = [
'match_matrix_tensor',
'tree_conv',
'multiclass_nms2',
'search_pyramid_hash',
]
......@@ -563,3 +564,98 @@ def multiclass_nms2(bboxes,
if return_index:
return output, index
return output
def search_pyramid_hash(input,
num_emb,
space_len,
pyramid_layer,
rand_len,
drop_out_percent,
is_training,
use_filter,
white_list_len,
black_list_len,
seed,
lr,
param_attr=None,
param_attr_wl=None,
param_attr_bl=None,
name=None,
dtype='float32'):
"""
**Pyramid hash embedding**
Args:
input (Variable): LoDTensor<int32> Variable containing the IDs' information.
num_emb (int): The embedding size of output.
space_len (int): The length of pyramid hash embedding space.
pyramid_layer (int): The number of pyramid layers. It should not be less than 2.
rand_len (int): The minimum length of pyramid hash cell.
drop_out_percent (float): The probability of randomly dropping out an input token.
It should be in the range [0., 1.].
is_training (bool): Whether it is in the training or testing phase.
use_filter(bool): If set True, the white filter and black filter should be given by
:attr:`param_attr_wl` and :attr:`param_attr_bl` .
white_list_len(int): If set :math:`white_list_len>0` , white filter with shape [white_list_len, 1]
should be provided by param_attr_wl.
black_list_len(int): If set :math:`black_list_len>0` , black filter with shape [black_list_len, 1]
should be provided by param_attr_bl.
seed(int): The random seed.
lr(float): The learning rate of weight created by :attr:`param_attr` with shape [space_len+rand_len, 1]
in this layer.
param_attr(ParamAttr): To specify the weight parameter property. Default: None, which means the
default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` .
param_attr_wl(ParamAttr): Specified parameters of white filter.
param_attr_bl(ParamAttr): Specified parameters of black filter.
name(str, optional): The default value is None. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name` .
dtype(str): The data type of the output variable. Only float32 is supported.
Returns:
Variable: LoDTensor of pyramid hash embedding.
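Examples:
    A minimal usage sketch, in line with the unit test added by this commit;
    all sizes and hyper-parameters below are illustrative only.
    .. code-block:: python
        import paddle.fluid as fluid
        x = fluid.data(name='x', shape=[16, 10], dtype='int32', lod_level=1)
        embed = fluid.contrib.search_pyramid_hash(
            input=x, num_emb=64, space_len=64 * 128, pyramid_layer=4,
            rand_len=16, drop_out_percent=0.5, is_training=False,
            use_filter=False, white_list_len=0, black_list_len=0,
            seed=3, lr=0.002)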
"""
helper = LayerHelper('search_pyramid_hash', **locals())
w_shape = [space_len + rand_len, 1]
w = helper.create_parameter(
attr=param_attr, shape=w_shape, dtype=dtype, is_bias=False)
w.stop_gradient = True
input_vars = {'X': input, 'W': w}
if white_list_len > 0:
wl_shape = [white_list_len, 1]
white_list = helper.create_parameter(
attr=param_attr_wl, shape=wl_shape, dtype=dtype, is_bias=False)
white_list.stop_gradient = True
input_vars['WhiteList'] = white_list
if black_list_len > 0:
bl_shape = [black_list_len, 1]
black_list = helper.create_parameter(
attr=param_attr_bl, shape=bl_shape, dtype=dtype, is_bias=False)
black_list.stop_gradient = True
input_vars['BlackList'] = black_list
res = helper.create_variable_for_type_inference(dtype)
drop_pos = helper.create_variable_for_type_inference(dtype)
x_temp_out = helper.create_variable_for_type_inference(dtype)
helper.append_op(
type='pyramid_hash',
inputs=input_vars,
outputs={"Out": res,
"X_Temp_Out": x_temp_out,
'DropPos': drop_pos},
attrs={
'num_emb': num_emb,
'space_len': space_len,
'pyramid_layer': pyramid_layer,
'rand_len': rand_len,
'drop_out_percent': drop_out_percent,
'is_training': is_training,
'use_filter': use_filter,
'white_list_len': white_list_len,
'black_list_len': black_list_len,
'seed': seed,
'lr': lr,
})
return res
......@@ -74,6 +74,10 @@ if(NOT WITH_MKL)
list(REMOVE_ITEM TEST_OPS test_var_conv_2d)
endif(NOT WITH_MKL)
if(WITH_COVERAGE OR NOT WITH_AVX OR WIN32)
list(REMOVE_ITEM TEST_OPS test_pyramid_hash_op)
endif()
if(WITH_GPU OR NOT WITH_MKLML)
# matmul with multiple heads need MKL support
LIST(REMOVE_ITEM TEST_OPS test_matmul_op_with_head)
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle.fluid as fluid
class TestPyramidHashOpApi(unittest.TestCase):
def test_api(self):
num_voc = 128
embed_dim = 64
x_shape, x_lod = [16, 10], [[3, 5, 2, 6]]
x = fluid.data(name='x', shape=x_shape, dtype='int32', lod_level=1)
hash_embd = fluid.contrib.search_pyramid_hash(
input=x,
num_emb=embed_dim,
space_len=num_voc * embed_dim,
pyramid_layer=4,
rand_len=16,
drop_out_percent=0.5,
is_training=True,
use_filter=False,
white_list_len=6400,
black_list_len=2800,
seed=3,
lr=0.002,
param_attr=fluid.ParamAttr(
name="PyramidHash_emb_0",
learning_rate=0, ),
param_attr_wl=fluid.ParamAttr(
name="Filter",
learning_rate=0, ),
param_attr_bl=None,
name=None, )
place = fluid.CPUPlace()
x_tensor = fluid.create_lod_tensor(
np.random.randint(0, num_voc, x_shape).astype('int32'), x_lod,
place)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
ret = exe.run(feed={'x': x_tensor},
fetch_list=[hash_embd],
return_numpy=False)
if __name__ == "__main__":
unittest.main()