Unverified commit c6331afa authored by O2Dyokii, committed by GitHub

Merge pull request #7 from O2Dyokii/rs

PNN-demo
TRAIN_FILE = "data/train.csv"
TEST_FILE = "data/test.csv"
SUB_DIR = "output"
NUM_SPLITS = 3
RANDOM_SEED = 2017
# types of columns of the dataset dataframe
CATEGORICAL_COLS = [
# 'ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat',
# 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat',
# 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat',
# 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat',
# 'ps_car_10_cat', 'ps_car_11_cat',
]
NUMERIC_COLS = [
# # binary
# "ps_ind_06_bin", "ps_ind_07_bin", "ps_ind_08_bin",
# "ps_ind_09_bin", "ps_ind_10_bin", "ps_ind_11_bin",
# "ps_ind_12_bin", "ps_ind_13_bin", "ps_ind_16_bin",
# "ps_ind_17_bin", "ps_ind_18_bin",
# "ps_calc_15_bin", "ps_calc_16_bin", "ps_calc_17_bin",
# "ps_calc_18_bin", "ps_calc_19_bin", "ps_calc_20_bin",
# numeric
"ps_reg_01", "ps_reg_02", "ps_reg_03",
"ps_car_12", "ps_car_13", "ps_car_14", "ps_car_15",
# feature engineering
"missing_feat", "ps_car_13_x_ps_reg_03",
]
IGNORE_COLS = [
"id", "target",
"ps_calc_01", "ps_calc_02", "ps_calc_03", "ps_calc_04",
"ps_calc_05", "ps_calc_06", "ps_calc_07", "ps_calc_08",
"ps_calc_09", "ps_calc_10", "ps_calc_11", "ps_calc_12",
"ps_calc_13", "ps_calc_14",
"ps_calc_15_bin", "ps_calc_16_bin", "ps_calc_17_bin",
"ps_calc_18_bin", "ps_calc_19_bin", "ps_calc_20_bin"
]
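# How these lists are consumed downstream: IGNORE_COLS are dropped entirely,
# NUMERIC_COLS keep their raw values, and every remaining column is treated as
# categorical by DataReader.FeatureDictionary (one index per unique value).
# CATEGORICAL_COLS itself is only used to build cat_features_indices in the
# training script.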
import pandas as pd
class FeatureDictionary(object):
def __init__(self,trainfile=None,testfile=None,
dfTrain=None,dfTest=None,numeric_cols=[],
ignore_cols=[]):
assert not ((trainfile is None) and (dfTrain is None)), "at least one of trainfile or dfTrain must be set"
assert not ((trainfile is not None) and (dfTrain is not None)), "only one of trainfile or dfTrain can be set"
assert not ((testfile is None) and (dfTest is None)), "at least one of testfile or dfTest must be set"
assert not ((testfile is not None) and (dfTest is not None)), "only one of testfile or dfTest can be set"
self.trainfile = trainfile
self.testfile = testfile
self.dfTrain = dfTrain
self.dfTest = dfTest
self.numeric_cols = numeric_cols
self.ignore_cols = ignore_cols
self.gen_feat_dict()
def gen_feat_dict(self):
if self.dfTrain is None:
dfTrain = pd.read_csv(self.trainfile)
else:
dfTrain = self.dfTrain
if self.dfTest is None:
dfTest = pd.read_csv(self.testfile)
else:
dfTest = self.dfTest
df = pd.concat([dfTrain,dfTest])
self.feat_dict = {}
tc = 0
for col in df.columns:
if col in self.ignore_cols:
continue
if col in self.numeric_cols:
self.feat_dict[col] = tc
tc += 1
else:
us = df[col].unique()
self.feat_dict[col] = dict(zip(us,range(tc,len(us)+tc)))
tc += len(us)
self.feat_dim = tc
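# A sketch of the resulting index space (hypothetical values, for illustration):
# with numeric_cols=["ps_reg_01"] and a categorical column "ps_car_02_cat"
# taking unique values {0, 1}, gen_feat_dict yields
#   feat_dict = {"ps_reg_01": 0, "ps_car_02_cat": {0: 1, 1: 2}}
#   feat_dim  = 3
# i.e. a numeric column consumes one index, a categorical column one per value.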
class DataParser(object):
def __init__(self,feat_dict):
self.feat_dict = feat_dict
def parse(self,infile=None,df=None,has_label=False):
assert not ((infile is None) and (df is None)), "at least one of infile or df must be set"
assert not ((infile is not None) and (df is not None)), "only one of infile or df can be set"
if infile is None:
dfi = df.copy()
else:
dfi = pd.read_csv(infile)
if has_label:
y = dfi['target'].values.tolist()
dfi.drop(['id','target'],axis=1,inplace=True)
else:
ids = dfi['id'].values.tolist()
dfi.drop(['id'],axis=1,inplace=True)
# dfi for feature index
# dfv for feature value which can be either binary (1/0) or float (e.g., 10.24)
dfv = dfi.copy()
for col in dfi.columns:
if col in self.feat_dict.ignore_cols:
dfi.drop(col,axis=1,inplace=True)
dfv.drop(col,axis=1,inplace=True)
continue
if col in self.feat_dict.numeric_cols:
dfi[col] = self.feat_dict.feat_dict[col]
else:
dfi[col] = dfi[col].map(self.feat_dict.feat_dict[col])
dfv[col] = 1.
xi = dfi.values.tolist()
xv = dfv.values.tolist()
if has_label:
return xi,xv,y
else:
return xi,xv,ids
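# A sketch of parse() output for one row (hypothetical values): given the
# feat_dict above and a row with ps_reg_01=0.7, ps_car_02_cat=1,
#   Xi row -> [0, 2]     # feature indices
#   Xv row -> [0.7, 1.]  # raw value for numeric, 1. for categorical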
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from DataReader import FeatureDictionary, DataParser
import config
from model import PNN
def load_data():
dfTrain = pd.read_csv(config.TRAIN_FILE)
dfTest = pd.read_csv(config.TEST_FILE)
def preprocess(df):
cols = [c for c in df.columns if c not in ['id','target']]
df["missing_feat"] = np.sum((df[cols] == -1).values, axis=1)
df['ps_car_13_x_ps_reg_03'] = df['ps_car_13'] * df['ps_reg_03']
return df
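# preprocess() adds the two engineered columns declared in config.NUMERIC_COLS:
# missing_feat counts the -1 placeholders per row (the dataset's missing-value
# marker) and ps_car_13_x_ps_reg_03 is a simple interaction feature.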
dfTrain = preprocess(dfTrain)
dfTest = preprocess(dfTest)
cols = [c for c in dfTrain.columns if c not in ['id','target']]
cols = [c for c in cols if (not c in config.IGNORE_COLS)]
X_train = dfTrain[cols].values
y_train = dfTrain['target'].values
X_test = dfTest[cols].values
ids_test = dfTest['id'].values
cat_features_indices = [i for i,c in enumerate(cols) if c in config.CATEGORICAL_COLS]
return dfTrain,dfTest,X_train,y_train,X_test,ids_test,cat_features_indices
def run_base_model_pnn(dfTrain,dfTest,folds,pnn_params):
fd = FeatureDictionary(dfTrain=dfTrain,
dfTest=dfTest,
numeric_cols=config.NUMERIC_COLS,
ignore_cols = config.IGNORE_COLS)
data_parser = DataParser(feat_dict= fd)
# Xi_train: per-sample lists of feature indices
# Xv_train: per-sample lists of the corresponding feature values
Xi_train,Xv_train,y_train = data_parser.parse(df=dfTrain,has_label=True)
Xi_test,Xv_test,ids_test = data_parser.parse(df=dfTest)
pnn_params['feature_size'] = fd.feat_dim
pnn_params['field_size'] = len(Xi_train[0])
_get = lambda x,l:[x[i] for i in l]
for i, (train_idx, valid_idx) in enumerate(folds):
Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)
pnn = PNN(**pnn_params)
pnn.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)
pnn_params = {
"embedding_size":8,
"deep_layers":[32,32],
"dropout_deep":[0.5,0.5,0.5],
"deep_layer_activation":tf.nn.relu,
"epoch":30,
"batch_size":1024,
"learning_rate":0.001,
"optimizer":"adam",
"batch_norm":1,
"batch_norm_decay":0.995,
"verbose":True,
"random_seed":config.RANDOM_SEED,
"deep_init_size":50,
"use_inner":False
}
# load data
dfTrain, dfTest, X_train, y_train, X_test, ids_test, cat_features_indices = load_data()
# folds
folds = list(StratifiedKFold(n_splits=config.NUM_SPLITS, shuffle=True,
random_state=config.RANDOM_SEED).split(X_train, y_train))
# run_base_model_pnn trains one PNN per fold; it does not return predictions
run_base_model_pnn(dfTrain, dfTest, folds, pnn_params)
import numpy as np
import tensorflow as tf
from time import time
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_auc_score
class PNN(BaseEstimator, TransformerMixin):
def __init__(self, feature_size, field_size,
embedding_size=8,
deep_layers=[32, 32], deep_init_size = 50,
dropout_deep=[0.5, 0.5, 0.5],
deep_layer_activation=tf.nn.relu,
epoch=10, batch_size=256,
learning_rate=0.001, optimizer="adam",
batch_norm=0, batch_norm_decay=0.995,
verbose=False, random_seed=2016,
loss_type="logloss", eval_metric=roc_auc_score,
greater_is_better=True,
use_inner=True):
assert loss_type in ["logloss", "mse"], \
"loss_type can be either 'logloss' for classification task or 'mse' for regression task"
self.feature_size = feature_size
self.field_size = field_size
self.embedding_size = embedding_size
self.deep_layers = deep_layers
self.deep_init_size = deep_init_size
self.dropout_dep = dropout_deep
self.deep_layers_activation = deep_layer_activation
self.epoch = epoch
self.batch_size = batch_size
self.learning_rate = learning_rate
self.optimizer_type = optimizer
self.batch_norm = batch_norm
self.batch_norm_decay = batch_norm_decay
self.verbose = verbose
self.random_seed = random_seed
self.loss_type = loss_type
self.eval_metric = eval_metric
self.greater_is_better = greater_is_better
self.train_result,self.valid_result = [],[]
self.use_inner = use_inner
self._init_graph()
def _init_graph(self):
self.graph = tf.Graph()
with self.graph.as_default():
tf.set_random_seed(self.random_seed)
self.feat_index = tf.placeholder(tf.int32,
shape=[None,None],
name='feat_index')
self.feat_value = tf.placeholder(tf.float32,
shape=[None,None],
name='feat_value')
self.label = tf.placeholder(tf.float32,shape=[None,1],name='label')
self.dropout_keep_deep = tf.placeholder(tf.float32,shape=[None],name='dropout_keep_deep')
self.train_phase = tf.placeholder(tf.bool,name='train_phase')
self.weights = self._initialize_weights()
# Embeddings
self.embeddings = tf.nn.embedding_lookup(self.weights['feature_embeddings'],self.feat_index) # N * F * K
feat_value = tf.reshape(self.feat_value,shape=[-1,self.field_size,1])
self.embeddings = tf.multiply(self.embeddings,feat_value) # N * F * K
# Linear Signal
linear_output = []
for i in range(self.deep_init_size):
linear_output.append(tf.reshape(
tf.reduce_sum(tf.multiply(self.embeddings,self.weights['product-linear'][i]),axis=[1,2]),shape=(-1,1)))# N * 1
self.lz = tf.concat(linear_output,axis=1) # N * init_deep_size
# Quadratic Signal
quadratic_output = []
if self.use_inner:
for i in range(self.deep_init_size):
theta = tf.multiply(self.embeddings,tf.reshape(self.weights['product-quadratic-inner'][i],(1,-1,1))) # N * F * K
quadratic_output.append(tf.reshape(tf.norm(tf.reduce_sum(theta,axis=1),axis=1),shape=(-1,1))) # N * 1
else:
embedding_sum = tf.reduce_sum(self.embeddings,axis=1)
p = tf.matmul(tf.expand_dims(embedding_sum,2),tf.expand_dims(embedding_sum,1)) # N * K * K
for i in range(self.deep_init_size):
theta = tf.multiply(p,tf.expand_dims(self.weights['product-quadratic-outer'][i],0)) # N * K * K
quadratic_output.append(tf.reshape(tf.reduce_sum(theta,axis=[1,2]),shape=(-1,1))) # N * 1
self.lp = tf.concat(quadratic_output,axis=1) # N * init_deep_size
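# Product layer recap: for each of the deep_init_size output units i,
#   lz_i = sum_{f,k} W_z[i,f,k] * E[f,k]                    (linear signal)
# inner-product variant:  lp_i = || sum_f w_i[f] * E[f] ||_2
# outer-product variant:  lp_i = sum_{k,k'} W_o[i,k,k'] * (e_sum e_sum^T)[k,k']
# where E is the N * F * K embedding tensor and e_sum = sum_f E[f].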
self.y_deep = tf.nn.relu(tf.add(tf.add(self.lz, self.lp), self.weights['product-bias']))
self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
# Deep component
for i in range(0,len(self.deep_layers)):
self.y_deep = tf.add(tf.matmul(self.y_deep,self.weights["layer_%d" %i]), self.weights["bias_%d"%i])
self.y_deep = self.deep_layers_activation(self.y_deep)
self.y_deep = tf.nn.dropout(self.y_deep,self.dropout_keep_deep[i+1])
self.out = tf.add(tf.matmul(self.y_deep,self.weights['output']),self.weights['output_bias'])
# loss
if self.loss_type == "logloss":
self.out = tf.nn.sigmoid(self.out)
self.loss = tf.losses.log_loss(self.label, self.out)
elif self.loss_type == "mse":
self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
if self.optimizer_type == "adam":
self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,
epsilon=1e-8).minimize(self.loss)
elif self.optimizer_type == "adagrad":
self.optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate,
initial_accumulator_value=1e-8).minimize(self.loss)
elif self.optimizer_type == "gd":
self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
elif self.optimizer_type == "momentum":
self.optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95).minimize(
self.loss)
#init
self.saver = tf.train.Saver()
init = tf.global_variables_initializer()
self.sess = tf.Session()
self.sess.run(init)
# number of params
total_parameters = 0
for variable in self.weights.values():
shape = variable.get_shape()
variable_parameters = 1
for dim in shape:
variable_parameters *= dim.value
total_parameters += variable_parameters
if self.verbose > 0:
print("#params: %d" % total_parameters)
def _initialize_weights(self):
weights = dict()
#embeddings
weights['feature_embeddings'] = tf.Variable(
tf.random_normal([self.feature_size,self.embedding_size],0.0,0.01),
name='feature_embeddings')
weights['feature_bias'] = tf.Variable(tf.random_normal([self.feature_size,1],0.0,1.0),name='feature_bias')
#Product Layers
if self.use_inner:
weights['product-quadratic-inner'] = tf.Variable(tf.random_normal([self.deep_init_size,self.field_size],0.0,0.01))
else:
weights['product-quadratic-outer'] = tf.Variable(
tf.random_normal([self.deep_init_size, self.embedding_size,self.embedding_size], 0.0, 0.01))
weights['product-linear'] = tf.Variable(tf.random_normal([self.deep_init_size,self.field_size,self.embedding_size],0.0,0.01))
weights['product-bias'] = tf.Variable(tf.random_normal([self.deep_init_size,],0.0,1.0))
#deep layers
num_layer = len(self.deep_layers)
input_size = self.deep_init_size
glorot = np.sqrt(2.0/(input_size + self.deep_layers[0]))
weights['layer_0'] = tf.Variable(
np.random.normal(loc=0,scale=glorot,size=(input_size,self.deep_layers[0])),dtype=np.float32
)
weights['bias_0'] = tf.Variable(
np.random.normal(loc=0,scale=glorot,size=(1,self.deep_layers[0])),dtype=np.float32
)
for i in range(1,num_layer):
glorot = np.sqrt(2.0 / (self.deep_layers[i - 1] + self.deep_layers[i]))
weights["layer_%d" % i] = tf.Variable(
np.random.normal(loc=0, scale=glorot, size=(self.deep_layers[i - 1], self.deep_layers[i])),
dtype=np.float32) # layers[i-1] * layers[i]
weights["bias_%d" % i] = tf.Variable(
np.random.normal(loc=0, scale=glorot, size=(1, self.deep_layers[i])),
dtype=np.float32) # 1 * layer[i]
glorot = np.sqrt(2.0/(self.deep_layers[-1] + 1))
weights['output'] = tf.Variable(np.random.normal(loc=0,scale=glorot,size=(self.deep_layers[-1],1)),dtype=np.float32)
weights['output_bias'] = tf.Variable(tf.constant(0.01),dtype=np.float32)
return weights
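# Glorot/Xavier scale: sqrt(2 / (fan_in + fan_out)); e.g. for layer_0 under
# the demo config (deep_init_size=50, deep_layers[0]=32) this is
# sqrt(2 / 82) ≈ 0.156.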
def get_batch(self,Xi,Xv,y,batch_size,index):
start = index * batch_size
end = (index + 1) * batch_size
end = end if end < len(y) else len(y)
return Xi[start:end],Xv[start:end],[[y_] for y_ in y[start:end]]
# shuffle three lists simultaneously
def shuffle_in_unison_scary(self, a, b, c):
rng_state = np.random.get_state()
np.random.shuffle(a)
np.random.set_state(rng_state)
np.random.shuffle(b)
np.random.set_state(rng_state)
np.random.shuffle(c)
def predict(self, Xi, Xv, y):
"""
:param Xi: list of lists of feature indices of each sample in the dataset
:param Xv: list of lists of feature values of each sample in the dataset
:param y: label of each sample, needed because this method reports the loss
:return: loss of the model on the given data
"""
feed_dict = {self.feat_index: Xi,
self.feat_value: Xv,
self.label: y,
self.dropout_keep_deep: [1.0] * len(self.dropout_dep),
self.train_phase: False}
loss = self.sess.run(self.loss, feed_dict=feed_dict)
return loss
def fit_on_batch(self,Xi,Xv,y):
feed_dict = {self.feat_index:Xi,
self.feat_value:Xv,
self.label:y,
self.dropout_keep_deep:self.dropout_dep,
self.train_phase:True}
loss,opt = self.sess.run([self.loss,self.optimizer],feed_dict=feed_dict)
return loss
def fit(self, Xi_train, Xv_train, y_train,
Xi_valid=None, Xv_valid=None, y_valid=None,
early_stopping=False, refit=False):
"""
:param Xi_train: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...]
indi_j is the feature index of feature field j of sample i in the training set
:param Xv_train: [[val1_1, val1_2, ...], [val2_1, val2_2, ...], ..., [vali_1, vali_2, ..., vali_j, ...], ...]
vali_j is the feature value of feature field j of sample i in the training set
vali_j can be either binary (1/0, for binary/categorical features) or float (e.g., 10.24, for numerical features)
:param y_train: label of each sample in the training set
:param Xi_valid: list of list of feature indices of each sample in the validation set
:param Xv_valid: list of list of feature values of each sample in the validation set
:param y_valid: label of each sample in the validation set
:param early_stopping: perform early stopping or not
:param refit: refit the model on the train+valid dataset or not
:return: None
"""
has_valid = Xv_valid is not None
for epoch in range(self.epoch):
t1 = time()
self.shuffle_in_unison_scary(Xi_train, Xv_train, y_train)
total_batch = (len(y_train) + self.batch_size - 1) // self.batch_size  # include the final partial batch
for i in range(total_batch):
Xi_batch, Xv_batch, y_batch = self.get_batch(Xi_train, Xv_train, y_train, self.batch_size, i)
self.fit_on_batch(Xi_batch, Xv_batch, y_batch)
if has_valid:
y_valid = np.array(y_valid).reshape((-1,1))
loss = self.predict(Xi_valid, Xv_valid, y_valid)
print("epoch",epoch,"loss",loss)