Unverified commit c6331afa authored by O2Dyokii, committed by GitHub

Merge pull request #7 from O2Dyokii/rs

PNN-demo
TRAIN_FILE = "data/train.csv"
TEST_FILE = "data/test.csv"
SUB_DIR = "output"
NUM_SPLITS = 3
RANDOM_SEED = 2017
# types of columns of the dataset dataframe
CATEGORICAL_COLS = [
# 'ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat',
# 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat',
# 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat',
# 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat',
# 'ps_car_10_cat', 'ps_car_11_cat',
]
NUMERIC_COLS = [
# # binary
# "ps_ind_06_bin", "ps_ind_07_bin", "ps_ind_08_bin",
# "ps_ind_09_bin", "ps_ind_10_bin", "ps_ind_11_bin",
# "ps_ind_12_bin", "ps_ind_13_bin", "ps_ind_16_bin",
# "ps_ind_17_bin", "ps_ind_18_bin",
# "ps_calc_15_bin", "ps_calc_16_bin", "ps_calc_17_bin",
# "ps_calc_18_bin", "ps_calc_19_bin", "ps_calc_20_bin",
# numeric
"ps_reg_01", "ps_reg_02", "ps_reg_03",
"ps_car_12", "ps_car_13", "ps_car_14", "ps_car_15",
# feature engineering
"missing_feat", "ps_car_13_x_ps_reg_03",
]
IGNORE_COLS = [
"id", "target",
"ps_calc_01", "ps_calc_02", "ps_calc_03", "ps_calc_04",
"ps_calc_05", "ps_calc_06", "ps_calc_07", "ps_calc_08",
"ps_calc_09", "ps_calc_10", "ps_calc_11", "ps_calc_12",
"ps_calc_13", "ps_calc_14",
"ps_calc_15_bin", "ps_calc_16_bin", "ps_calc_17_bin",
"ps_calc_18_bin", "ps_calc_19_bin", "ps_calc_20_bin"
]
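# How these lists are consumed downstream: IGNORE_COLS are dropped entirely,
# NUMERIC_COLS keep their raw values, and every remaining column is treated as
# categorical by DataReader.FeatureDictionary (one index per unique value).
# CATEGORICAL_COLS itself is only used to build cat_features_indices in the
# training script.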
import pandas as pd
class FeatureDictionary(object):
def __init__(self,trainfile=None,testfile=None,
dfTrain=None,dfTest=None,numeric_cols=[],
ignore_cols=[]):
assert not ((trainfile is None) and (dfTrain is None)), "at least one of trainfile or dfTrain must be set"
assert not ((trainfile is not None) and (dfTrain is not None)), "only one of trainfile or dfTrain can be set"
assert not ((testfile is None) and (dfTest is None)), "at least one of testfile or dfTest must be set"
assert not ((testfile is not None) and (dfTest is not None)), "only one of testfile or dfTest can be set"
self.trainfile = trainfile
self.testfile = testfile
self.dfTrain = dfTrain
self.dfTest = dfTest
self.numeric_cols = numeric_cols
self.ignore_cols = ignore_cols
self.gen_feat_dict()
def gen_feat_dict(self):
if self.dfTrain is None:
dfTrain = pd.read_csv(self.trainfile)
else:
dfTrain = self.dfTrain
if self.dfTest is None:
dfTest = pd.read_csv(self.testfile)
else:
dfTest = self.dfTest
df = pd.concat([dfTrain,dfTest])
self.feat_dict = {}
tc = 0
for col in df.columns:
if col in self.ignore_cols:
continue
if col in self.numeric_cols:
self.feat_dict[col] = tc
tc += 1
else:
us = df[col].unique()
self.feat_dict[col] = dict(zip(us,range(tc,len(us)+tc)))
tc += len(us)
self.feat_dim = tc
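# A sketch of the resulting index space (hypothetical values, for illustration):
# with numeric_cols=["ps_reg_01"] and a categorical column "ps_car_02_cat"
# taking unique values {0, 1}, gen_feat_dict yields
#   feat_dict = {"ps_reg_01": 0, "ps_car_02_cat": {0: 1, 1: 2}}
#   feat_dim  = 3
# i.e. a numeric column consumes one index, a categorical column one per value.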
class DataParser(object):
def __init__(self,feat_dict):
self.feat_dict = feat_dict
def parse(self,infile=None,df=None,has_label=False):
assert not ((infile is None) and (df is None)), "at least one of infile or df must be set"
assert not ((infile is not None) and (df is not None)), "only one of infile or df can be set"
if infile is None:
dfi = df.copy()
else:
dfi = pd.read_csv(infile)
if has_label:
y = dfi['target'].values.tolist()
dfi.drop(['id','target'],axis=1,inplace=True)
else:
ids = dfi['id'].values.tolist()
dfi.drop(['id'],axis=1,inplace=True)
# dfi for feature index
# dfv for feature value which can be either binary (1/0) or float (e.g., 10.24)
dfv = dfi.copy()
for col in dfi.columns:
if col in self.feat_dict.ignore_cols:
dfi.drop(col,axis=1,inplace=True)
dfv.drop(col,axis=1,inplace=True)
continue
if col in self.feat_dict.numeric_cols:
dfi[col] = self.feat_dict.feat_dict[col]
else:
dfi[col] = dfi[col].map(self.feat_dict.feat_dict[col])
dfv[col] = 1.
xi = dfi.values.tolist()
xv = dfv.values.tolist()
if has_label:
return xi,xv,y
else:
return xi,xv,ids
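# A sketch of parse() output for one row (hypothetical values): given the
# feat_dict above and a row with ps_reg_01=0.7, ps_car_02_cat=1,
#   Xi row -> [0, 2]     # feature indices
#   Xv row -> [0.7, 1.]  # raw value for numeric, 1. for categorical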
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from DataReader import FeatureDictionary, DataParser
import config
from model import PNN
def load_data():
dfTrain = pd.read_csv(config.TRAIN_FILE)
dfTest = pd.read_csv(config.TEST_FILE)
def preprocess(df):
cols = [c for c in df.columns if c not in ['id','target']]
df["missing_feat"] = np.sum((df[cols] == -1).values, axis=1)
df['ps_car_13_x_ps_reg_03'] = df['ps_car_13'] * df['ps_reg_03']
return df
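# preprocess() adds the two engineered columns declared in config.NUMERIC_COLS:
# missing_feat counts the -1 placeholders per row (the dataset's missing-value
# marker) and ps_car_13_x_ps_reg_03 is a simple interaction feature.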
dfTrain = preprocess(dfTrain)
dfTest = preprocess(dfTest)
cols = [c for c in dfTrain.columns if c not in ['id','target']]
cols = [c for c in cols if (not c in config.IGNORE_COLS)]
X_train = dfTrain[cols].values
y_train = dfTrain['target'].values
X_test = dfTest[cols].values
ids_test = dfTest['id'].values
cat_features_indices = [i for i,c in enumerate(cols) if c in config.CATEGORICAL_COLS]
return dfTrain,dfTest,X_train,y_train,X_test,ids_test,cat_features_indices
def run_base_model_pnn(dfTrain,dfTest,folds,pnn_params):
fd = FeatureDictionary(dfTrain=dfTrain,
dfTest=dfTest,
numeric_cols=config.NUMERIC_COLS,
ignore_cols = config.IGNORE_COLS)
data_parser = DataParser(feat_dict= fd)
# Xi_train: per-sample lists of feature indices
# Xv_train: per-sample lists of the corresponding feature values
Xi_train,Xv_train,y_train = data_parser.parse(df=dfTrain,has_label=True)
Xi_test,Xv_test,ids_test = data_parser.parse(df=dfTest)
pnn_params['feature_size'] = fd.feat_dim
pnn_params['field_size'] = len(Xi_train[0])
_get = lambda x,l:[x[i] for i in l]
for i, (train_idx, valid_idx) in enumerate(folds):
Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)
pnn = PNN(**pnn_params)
pnn.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)
pnn_params = {
"embedding_size":8,
"deep_layers":[32,32],
"dropout_deep":[0.5,0.5,0.5],
"deep_layer_activation":tf.nn.relu,
"epoch":30,
"batch_size":1024,
"learning_rate":0.001,
"optimizer":"adam",
"batch_norm":1,
"batch_norm_decay":0.995,
"verbose":True,
"random_seed":config.RANDOM_SEED,
"deep_init_size":50,
"use_inner":False
}
# load data
dfTrain, dfTest, X_train, y_train, X_test, ids_test, cat_features_indices = load_data()
# folds
folds = list(StratifiedKFold(n_splits=config.NUM_SPLITS, shuffle=True,
random_state=config.RANDOM_SEED).split(X_train, y_train))
# run_base_model_pnn trains one PNN per fold; it does not return predictions
run_base_model_pnn(dfTrain, dfTest, folds, pnn_params)
import numpy as np
import tensorflow as tf
from time import time
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_auc_score
class PNN(BaseEstimator, TransformerMixin):
def __init__(self, feature_size, field_size,
embedding_size=8,
deep_layers=[32, 32], deep_init_size = 50,
dropout_deep=[0.5, 0.5, 0.5],
deep_layer_activation=tf.nn.relu,
epoch=10, batch_size=256,
learning_rate=0.001, optimizer="adam",
batch_norm=0, batch_norm_decay=0.995,
verbose=False, random_seed=2016,
loss_type="logloss", eval_metric=roc_auc_score,
greater_is_better=True,
use_inner=True):
assert loss_type in ["logloss", "mse"], \
"loss_type can be either 'logloss' for classification task or 'mse' for regression task"
self.feature_size = feature_size
self.field_size = field_size
self.embedding_size = embedding_size
self.deep_layers = deep_layers
self.deep_init_size = deep_init_size
self.dropout_dep = dropout_deep
self.deep_layers_activation = deep_layer_activation
self.epoch = epoch
self.batch_size = batch_size
self.learning_rate = learning_rate
self.optimizer_type = optimizer
self.batch_norm = batch_norm
self.batch_norm_decay = batch_norm_decay
self.verbose = verbose
self.random_seed = random_seed
self.loss_type = loss_type
self.eval_metric = eval_metric
self.greater_is_better = greater_is_better
self.train_result,self.valid_result = [],[]
self.use_inner = use_inner
self._init_graph()
def _init_graph(self):
self.graph = tf.Graph()
with self.graph.as_default():
tf.set_random_seed(self.random_seed)
self.feat_index = tf.placeholder(tf.int32,
shape=[None,None],
name='feat_index')
self.feat_value = tf.placeholder(tf.float32,
shape=[None,None],
name='feat_value')
self.label = tf.placeholder(tf.float32,shape=[None,1],name='label')
self.dropout_keep_deep = tf.placeholder(tf.float32,shape=[None],name='dropout_keep_deep')
self.train_phase = tf.placeholder(tf.bool,name='train_phase')
self.weights = self._initialize_weights()
# Embeddings
self.embeddings = tf.nn.embedding_lookup(self.weights['feature_embeddings'],self.feat_index) # N * F * K
feat_value = tf.reshape(self.feat_value,shape=[-1,self.field_size,1])
self.embeddings = tf.multiply(self.embeddings,feat_value) # N * F * K
# Linear Signal
linear_output = []
for i in range(self.deep_init_size):
linear_output.append(tf.reshape(
tf.reduce_sum(tf.multiply(self.embeddings,self.weights['product-linear'][i]),axis=[1,2]),shape=(-1,1)))# N * 1
self.lz = tf.concat(linear_output,axis=1) # N * init_deep_size
# Quadratic Signal
quadratic_output = []
if self.use_inner:
for i in range(self.deep_init_size):
theta = tf.multiply(self.embeddings,tf.reshape(self.weights['product-quadratic-inner'][i],(1,-1,1))) # N * F * K
quadratic_output.append(tf.reshape(tf.norm(tf.reduce_sum(theta,axis=1),axis=1),shape=(-1,1))) # N * 1
else:
embedding_sum = tf.reduce_sum(self.embeddings,axis=1)
p = tf.matmul(tf.expand_dims(embedding_sum,2),tf.expand_dims(embedding_sum,1)) # N * K * K
for i in range(self.deep_init_size):
theta = tf.multiply(p,tf.expand_dims(self.weights['product-quadratic-outer'][i],0)) # N * K * K
quadratic_output.append(tf.reshape(tf.reduce_sum(theta,axis=[1,2]),shape=(-1,1))) # N * 1
self.lp = tf.concat(quadratic_output,axis=1) # N * init_deep_size
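# Product layer recap: for each of the deep_init_size output units i,
#   lz_i = sum_{f,k} W_z[i,f,k] * E[f,k]                    (linear signal)
# inner-product variant:  lp_i = || sum_f w_i[f] * E[f] ||_2
# outer-product variant:  lp_i = sum_{k,k'} W_o[i,k,k'] * (e_sum e_sum^T)[k,k']
# where E is the N * F * K embedding tensor and e_sum = sum_f E[f].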
self.y_deep = tf.nn.relu(tf.add(tf.add(self.lz, self.lp), self.weights['product-bias']))
self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
# Deep component
for i in range(0,len(self.deep_layers)):
self.y_deep = tf.add(tf.matmul(self.y_deep,self.weights["layer_%d" %i]), self.weights["bias_%d"%i])
self.y_deep = self.deep_layers_activation(self.y_deep)
self.y_deep = tf.nn.dropout(self.y_deep,self.dropout_keep_deep[i+1])
self.out = tf.add(tf.matmul(self.y_deep,self.weights['output']),self.weights['output_bias'])
# loss
if self.loss_type == "logloss":
self.out = tf.nn.sigmoid(self.out)
self.loss = tf.losses.log_loss(self.label, self.out)
elif self.loss_type == "mse":
self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
if self.optimizer_type == "adam":
self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,
epsilon=1e-8).minimize(self.loss)
elif self.optimizer_type == "adagrad":
self.optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate,
initial_accumulator_value=1e-8).minimize(self.loss)
elif self.optimizer_type == "gd":
self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
elif self.optimizer_type == "momentum":
self.optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95).minimize(
self.loss)
#init
self.saver = tf.train.Saver()
init = tf.global_variables_initializer()
self.sess = tf.Session()
self.sess.run(init)
# number of params
total_parameters = 0
for variable in self.weights.values():
shape = variable.get_shape()
variable_parameters = 1
for dim in shape:
variable_parameters *= dim.value
total_parameters += variable_parameters
if self.verbose > 0:
print("#params: %d" % total_parameters)
def _initialize_weights(self):
weights = dict()
#embeddings
weights['feature_embeddings'] = tf.Variable(
tf.random_normal([self.feature_size,self.embedding_size],0.0,0.01),
name='feature_embeddings')
weights['feature_bias'] = tf.Variable(tf.random_normal([self.feature_size,1],0.0,1.0),name='feature_bias')
#Product Layers
if self.use_inner:
weights['product-quadratic-inner'] = tf.Variable(tf.random_normal([self.deep_init_size,self.field_size],0.0,0.01))
else:
weights['product-quadratic-outer'] = tf.Variable(
tf.random_normal([self.deep_init_size, self.embedding_size,self.embedding_size], 0.0, 0.01))
weights['product-linear'] = tf.Variable(tf.random_normal([self.deep_init_size,self.field_size,self.embedding_size],0.0,0.01))
weights['product-bias'] = tf.Variable(tf.random_normal([self.deep_init_size,],0.0,1.0))
#deep layers
num_layer = len(self.deep_layers)
input_size = self.deep_init_size
glorot = np.sqrt(2.0/(input_size + self.deep_layers[0]))
weights['layer_0'] = tf.Variable(
np.random.normal(loc=0,scale=glorot,size=(input_size,self.deep_layers[0])),dtype=np.float32
)
weights['bias_0'] = tf.Variable(
np.random.normal(loc=0,scale=glorot,size=(1,self.deep_layers[0])),dtype=np.float32
)
for i in range(1,num_layer):
glorot = np.sqrt(2.0 / (self.deep_layers[i - 1] + self.deep_layers[i]))
weights["layer_%d" % i] = tf.Variable(
np.random.normal(loc=0, scale=glorot, size=(self.deep_layers[i - 1], self.deep_layers[i])),
dtype=np.float32) # layers[i-1] * layers[i]
weights["bias_%d" % i] = tf.Variable(
np.random.normal(loc=0, scale=glorot, size=(1, self.deep_layers[i])),
dtype=np.float32) # 1 * layer[i]
glorot = np.sqrt(2.0/(self.deep_layers[-1] + 1))
weights['output'] = tf.Variable(np.random.normal(loc=0,scale=glorot,size=(self.deep_layers[-1],1)),dtype=np.float32)
weights['output_bias'] = tf.Variable(tf.constant(0.01),dtype=np.float32)
return weights
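# Glorot/Xavier scale: sqrt(2 / (fan_in + fan_out)); e.g. for layer_0 under
# the demo config (deep_init_size=50, deep_layers[0]=32) this is
# sqrt(2 / 82) ≈ 0.156.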
def get_batch(self,Xi,Xv,y,batch_size,index):
start = index * batch_size
end = (index + 1) * batch_size
end = end if end < len(y) else len(y)
return Xi[start:end],Xv[start:end],[[y_] for y_ in y[start:end]]
# shuffle three lists simultaneously
def shuffle_in_unison_scary(self, a, b, c):
rng_state = np.random.get_state()
np.random.shuffle(a)
np.random.set_state(rng_state)
np.random.shuffle(b)
np.random.set_state(rng_state)
np.random.shuffle(c)
def predict(self, Xi, Xv, y):
"""
:param Xi: list of lists of feature indices of each sample in the dataset
:param Xv: list of lists of feature values of each sample in the dataset
:param y: label of each sample, needed because this method reports the loss
:return: loss of the model on the given data
"""
feed_dict = {self.feat_index: Xi,
self.feat_value: Xv,
self.label: y,
self.dropout_keep_deep: [1.0] * len(self.dropout_dep),
self.train_phase: False}
loss = self.sess.run(self.loss, feed_dict=feed_dict)
return loss
def fit_on_batch(self,Xi,Xv,y):
feed_dict = {self.feat_index:Xi,
self.feat_value:Xv,
self.label:y,
self.dropout_keep_deep:self.dropout_dep,
self.train_phase:True}
loss,opt = self.sess.run([self.loss,self.optimizer],feed_dict=feed_dict)
return loss
def fit(self, Xi_train, Xv_train, y_train,
Xi_valid=None, Xv_valid=None, y_valid=None,
early_stopping=False, refit=False):
"""
:param Xi_train: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...]
indi_j is the feature index of feature field j of sample i in the training set
:param Xv_train: [[val1_1, val1_2, ...], [val2_1, val2_2, ...], ..., [vali_1, vali_2, ..., vali_j, ...], ...]
vali_j is the feature value of feature field j of sample i in the training set
vali_j can be either binary (1/0, for binary/categorical features) or float (e.g., 10.24, for numerical features)
:param y_train: label of each sample in the training set
:param Xi_valid: list of list of feature indices of each sample in the validation set
:param Xv_valid: list of list of feature values of each sample in the validation set
:param y_valid: label of each sample in the validation set
:param early_stopping: perform early stopping or not
:param refit: refit the model on the train+valid dataset or not
:return: None
"""
has_valid = Xv_valid is not None
for epoch in range(self.epoch):
t1 = time()
self.shuffle_in_unison_scary(Xi_train, Xv_train, y_train)
total_batch = (len(y_train) + self.batch_size - 1) // self.batch_size  # include the final partial batch
for i in range(total_batch):
Xi_batch, Xv_batch, y_batch = self.get_batch(Xi_train, Xv_train, y_train, self.batch_size, i)
self.fit_on_batch(Xi_batch, Xv_batch, y_batch)
if has_valid:
y_valid = np.array(y_valid).reshape((-1,1))
loss = self.predict(Xi_valid, Xv_valid, y_valid)
print("epoch",epoch,"loss",loss)