iMetv2

From: https://www.kaggle.com/bigyellower/imetv2

Author: BigYellower

Score: 0.418

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.
['keras-pretrain-model-weights', 'imet-2019-fgvc6']
In [2]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import skimage.io
from skimage.transform import resize
from imgaug import augmenters as iaa
from tqdm import tqdm
import PIL
from PIL import Image
import cv2
from sklearn.utils import class_weight, shuffle
from keras.losses import binary_crossentropy
from keras.applications.resnet50 import preprocess_input
import keras.backend as K
import tensorflow as tf
from sklearn.metrics import f1_score, fbeta_score
from keras.utils import Sequence
WORKERS = 2
CHANNEL = 3

import warnings
warnings.filterwarnings("ignore")
SIZE = 256
NUM_CLASSES = 1103
beta_f2=2
Using TensorFlow backend.
In [3]:
# Load dataset info
path_to_train = '../input/imet-2019-fgvc6/train/'
data = pd.read_csv('../input/imet-2019-fgvc6/train.csv')

train_dataset_info = []
for name, labels in zip(data['id'], data['attribute_ids'].str.split(' ')):
    train_dataset_info.append({
        'path':os.path.join(path_to_train, name),
        'labels':np.array([int(label) for label in labels])})
train_dataset_info = np.array(train_dataset_info)

gamma = 1.0
epsilon = K.epsilon()
def focal_loss(y_true, y_pred):
    pt = y_pred * y_true + (1-y_pred) * (1-y_true)
    pt = K.clip(pt, epsilon, 1-epsilon)
    CE = -K.log(pt)
    FL = K.pow(1-pt, gamma) * CE
    loss = K.sum(FL, axis=1)
    return loss

sometimes = lambda aug: iaa.Sometimes(0.5, aug)

class data_generator(Sequence):
    
    def create_train(dataset_info, batch_size, shape, augument=True):
        assert shape[2] == 3
        while True:
            dataset_info = shuffle(dataset_info)
            for start in range(0, len(dataset_info), batch_size):
                end = min(start + batch_size, len(dataset_info))
                batch_images = []
                X_train_batch = dataset_info[start:end]
                batch_labels = np.zeros((len(X_train_batch), NUM_CLASSES))
                for i in range(len(X_train_batch)):
                    image = data_generator.load_image(
                        X_train_batch[i]['path'], shape)   
                    if augument:
                        image = data_generator.augment(image)
                    batch_images.append(preprocess_input(image))
                    batch_labels[i][X_train_batch[i]['labels']] = 1
                    
                yield np.array(batch_images, np.float32), batch_labels

    def create_valid(dataset_info, batch_size, shape, augument=False):
        assert shape[2] == 3
        while True:
            # dataset_info = shuffle(dataset_info)
            for start in range(0, len(dataset_info), batch_size):
                end = min(start + batch_size, len(dataset_info))
                batch_images = []
                X_train_batch = dataset_info[start:end]
                batch_labels = np.zeros((len(X_train_batch), NUM_CLASSES))
                for i in range(len(X_train_batch)):
                    image = data_generator.load_image(
                        X_train_batch[i]['path'], shape)   
                    if augument:
                        image = data_generator.augment(image)
                    batch_images.append(preprocess_input(image))
                    batch_labels[i][X_train_batch[i]['labels']] = 1
                yield np.array(batch_images, np.float32), batch_labels


    def load_image(path, shape):
        image = cv2.imread(path+'.png')
        image = cv2.resize(image, (SIZE, SIZE))
        return image

    def augment(image):
        augment_img = iaa.Sequential([
            sometimes(
            iaa.OneOf([
                # iaa.AddToHueAndSaturation((-20, 20)),
                iaa.Add((-10, 10), per_channel=0.5),
                iaa.Multiply((0.9, 1.1), per_channel=0.5),
                iaa.GaussianBlur((0, 0.5)), # blur images with a sigma between 0 and 3.0
                iaa.ContrastNormalization((0.8, 1.2), per_channel=0.5), # improve or worsen the contrast
                iaa.Sharpen(alpha=(0, 0.2), lightness=(0.8, 1.2)), # sharpen images
                iaa.Emboss(alpha=(0, 0.5), strength=(0, 0.5)), # emboss images
                # iaa.Crop(percent=(0, 0.1))
                ])
            ),
            iaa.OneOf([
#                 iaa.Affine(rotate=0),
#                 iaa.Affine(rotate=90),
#                 iaa.Affine(rotate=180),
#                 iaa.Affine(rotate=270),
                  iaa.Fliplr(0.5),
#                 iaa.Flipud(0.5),
            ])], random_order=True)

        image_aug = augment_img.augment_image(image)
        return image_aug
In [4]:
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential, load_model
from keras.layers import (Activation, Dropout, Flatten, Dense, GlobalMaxPooling2D,
                          BatchNormalization, Input, Conv2D, GlobalAveragePooling2D)
from keras.applications.resnext import ResNeXt101
from keras.callbacks import ModelCheckpoint
from keras import metrics
from keras.optimizers import Adam 
from keras import backend as K
import keras
from keras.models import Model
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-4-00e63315925f> in <module>()
      3 from keras.layers import (Activation, Dropout, Flatten, Dense, GlobalMaxPooling2D,
      4                           BatchNormalization, Input, Conv2D, GlobalAveragePooling2D)
----> 5 from keras.applications.resnext import ResNeXt101
      6 from keras.callbacks import ModelCheckpoint
      7 from keras import metrics

ModuleNotFoundError: No module named 'keras.applications.resnext'
In [5]:
def create_model(input_shape, n_out):
    input_tensor = Input(shape=input_shape)
    base_model = ResNeXt101(include_top=False,
                   weights=None,
                   input_tensor=input_tensor)
    base_model.load_weights('../input/keras-pretrain-model-weights/resnext101_weights_tf_dim_ordering_tf_kernels_notop.h5')
#     x = Conv2D(32, kernel_size=(1,1), activation='relu')(base_model.output)
#     x = Flatten()(x)
    x = GlobalAveragePooling2D()(base_model.output)
    x = Dropout(0.5)(x)
    x = Dense(1024, activation='relu')(x)
    x = Dropout(0.5)(x)
    final_output = Dense(n_out, activation='sigmoid', name='final_output')(x)
    model = Model(input_tensor, final_output)
    
    return model

# create callbacks list
from keras.callbacks import (ModelCheckpoint, LearningRateScheduler,
                             EarlyStopping, ReduceLROnPlateau,CSVLogger)
                             
from sklearn.model_selection import train_test_split

epochs = 30; batch_size = 24
checkpoint = ModelCheckpoint('../working/Resnet50_focal.h5', monitor='val_loss', verbose=1, 
                             save_best_only=True, mode='min', save_weights_only = True)
reduceLROnPlat = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, 
                                   verbose=1, mode='auto', epsilon=0.0001)
early = EarlyStopping(monitor="val_loss", 
                      mode="min", 
                      patience=2)

csv_logger = CSVLogger(filename='../working/training_log.csv',
                       separator=',',
                       append=True)


# split data into train, valid
indexes = np.arange(train_dataset_info.shape[0])
train_indexes, valid_indexes = train_test_split(indexes, test_size=0.15, random_state=8)

# create train and valid datagens
train_generator = data_generator.create_train(
    train_dataset_info[train_indexes], batch_size, (SIZE,SIZE,3), augument=True)
train_generator_warmup = data_generator.create_train(
    train_dataset_info[train_indexes], batch_size, (SIZE,SIZE,3), augument=False)
validation_generator = data_generator.create_valid(
    train_dataset_info[valid_indexes], batch_size, (SIZE,SIZE,3), augument=False)

callbacks_list = [checkpoint, csv_logger, reduceLROnPlat,early]

# warm up model
model = create_model(
    input_shape=(SIZE,SIZE,3), 
    n_out=NUM_CLASSES)

for layer in model.layers:
    layer.trainable = False

for i in range(-5,0):
    model.layers[i].trainable = True

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(1e-3))

# model.summary()

model.fit_generator(
    train_generator_warmup,
    steps_per_epoch=np.ceil(float(len(train_indexes)) / float(batch_size)),
    epochs=5,
    max_queue_size=16, workers=WORKERS, use_multiprocessing=True,
    verbose=1)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-8557ed849037> in <module>()
     53 model = create_model(
     54     input_shape=(SIZE,SIZE,3),
---> 55     n_out=NUM_CLASSES)
     56 
     57 for layer in model.layers:

<ipython-input-5-8557ed849037> in create_model(input_shape, n_out)
      1 def create_model(input_shape, n_out):
      2     input_tensor = Input(shape=input_shape)
----> 3     base_model = ResNeXt101(include_top=False,
      4                    weights=None,
      5                    input_tensor=input_tensor)

NameError: name 'ResNeXt101' is not defined
In [6]:
# train all layers
for layer in model.layers:
    layer.trainable = True

model.compile(loss='binary_crossentropy',
            # loss=focal_loss,
            optimizer=Adam(lr=1e-4))

model.fit_generator(
    train_generator,
    steps_per_epoch=np.ceil(float(len(train_indexes)) / float(batch_size)),
    validation_data=validation_generator,
    validation_steps=np.ceil(float(len(valid_indexes)) / float(batch_size)),
    epochs=epochs,
    verbose=1,
    max_queue_size=16, workers=WORKERS, use_multiprocessing=True,
    callbacks=callbacks_list)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-d33659382059> in <module>()
      1 # train all layers
----> 2 for layer in model.layers:
      3     layer.trainable = True
      4 
      5 model.compile(loss='binary_crossentropy',

NameError: name 'model' is not defined
In [7]:
print(os.listdir('../working/'))

submit = pd.read_csv('../input/imet-2019-fgvc6/sample_submission.csv')
model.load_weights('../working/Resnet50_focal.h5')
predicted = []

'''Search for the best threshold regarding the validation set'''

BATCH = 512
fullValGen = data_generator.create_valid(
    train_dataset_info[valid_indexes], BATCH, (SIZE,SIZE,3))

n_val = round(train_dataset_info.shape[0]*0.15)//BATCH
print(n_val)

lastFullValPred = np.empty((0, NUM_CLASSES))
lastFullValLabels = np.empty((0, NUM_CLASSES))
for i in tqdm(range(n_val+1)): 
    im, lbl = next(fullValGen)
    scores = model.predict(im)
    lastFullValPred = np.append(lastFullValPred, scores, axis=0)
    lastFullValLabels = np.append(lastFullValLabels, lbl, axis=0)
print(lastFullValPred.shape, lastFullValLabels.shape)
['__notebook__.ipynb', '__output__.json']
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-7-765a2f15fe5f> in <module>()
      2 
      3 submit = pd.read_csv('../input/imet-2019-fgvc6/sample_submission.csv')
----> 4 model.load_weights('../working/Resnet50_focal.h5')
      5 predicted = []
      6 

NameError: name 'model' is not defined
In [8]:
def my_f2(y_true, y_pred):
    assert y_true.shape[0] == y_pred.shape[0]

    tp = np.sum((y_true == 1) & (y_pred == 1))
    tn = np.sum((y_true == 0) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    
    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f2 = (1+beta_f2**2)*p*r / (p*beta_f2**2 + r + 1e-15)

    return f2

def find_best_fixed_threshold(preds, targs, do_plot=True):
    score = []
    thrs = np.arange(0, 0.5, 0.01)
    for thr in tqdm(thrs):
        score.append(my_f2(targs, (preds > thr).astype(int) ))
    score = np.array(score)
    pm = score.argmax()
    best_thr, best_score = thrs[pm], score[pm].item()
    print(f'thr={best_thr:.3f}', f'F2={best_score:.3f}')
    if do_plot:
        plt.plot(thrs, score)
        plt.vlines(x=best_thr, ymin=score.min(), ymax=score.max())
        plt.text(best_thr+0.03, best_score-0.01, f'$F_{2}=${best_score:.3f}', fontsize=14);
        plt.show()
    return best_thr, best_score

best_thr, best_score = find_best_fixed_threshold(lastFullValPred, lastFullValLabels, do_plot=True)

for i, name in tqdm(enumerate(submit['id'])):
    path = os.path.join('../input/imet-2019-fgvc6/test/', name)
    image = data_generator.load_image(path, (SIZE,SIZE,3))
    score_predict = model.predict(preprocess_input(image[np.newaxis]))
    # print(score_predict)
    label_predict = np.arange(NUM_CLASSES)[score_predict[0]>=best_thr]
    # print(label_predict)
    str_predict_label = ' '.join(str(l) for l in label_predict)
    predicted.append(str_predict_label)
    
submit['attribute_ids'] = predicted
submit.to_csv('submission.csv', index=False)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-d826041bf0b9> in <module>()
     30     return best_thr, best_score
     31 
---> 32 best_thr, best_score = find_best_fixed_threshold(lastFullValPred, lastFullValLabels, do_plot=True)
     33 
     34 for i, name in tqdm(enumerate(submit['id'])):

NameError: name 'lastFullValPred' is not defined
In [9]:
print(os.listdir('../working/'))
['__notebook__.ipynb', '__output__.json']