Train file with labels and meta data

From: https://www.kaggle.com/chewzy/train-file-with-labels-and-meta-data

Author: ChewZY

This kernel prepares the CSV file needed for another kernel: EDA - Weird Images.

The CSV contains:

  • Image-level meta data (width, height, pixel statistics, etc.)
  • One-hot encoded image labels
In [1]:
import numpy as np
import pandas as pd
import PIL
import gc

from PIL import ImageOps, ImageFilter
from multiprocessing import Pool
In [2]:
def get_img_properties(img_id, path):
    im = PIL.Image.open(f'{path}{img_id}.png')
    
    width = im.size[0]
    height = im.size[1]
    
    # per-channel mean and standard deviation of the raw pixel values
    r, g, b = im.split()
    r_arr, g_arr, b_arr = np.array(r), np.array(g), np.array(b)
    r_mean, r_std = np.mean(r_arr), np.std(r_arr)
    g_mean, g_std = np.mean(g_arr), np.std(g_arr)
    b_mean, b_std = np.mean(b_arr), np.std(b_arr)
    
    # per-channel mean and standard deviation after an edge-detection filter
    edges_arr = np.array(im.filter(ImageFilter.FIND_EDGES))
    r_edge_arr, g_edge_arr, b_edge_arr = edges_arr[:, :, 0], edges_arr[:, :, 1], edges_arr[:, :, 2]
    r_edge_mean, r_edge_std = np.mean(r_edge_arr), np.std(r_edge_arr)
    g_edge_mean, g_edge_std = np.mean(g_edge_arr), np.std(g_edge_arr)
    b_edge_mean, b_edge_std = np.mean(b_edge_arr), np.std(b_edge_arr)
    
    # index and height of the tallest bin in the concatenated per-band histogram
    hist = im.histogram()
    peak_index = np.argmax(hist)
    peak_val = np.max(hist) / (width * height)  # normalize as images have different sizes
    
    return np.array([width, height,
                     r_mean, r_std, g_mean, g_std, b_mean, b_std,
                     r_edge_mean, r_edge_std, g_edge_mean, g_edge_std, b_edge_mean, b_edge_std,
                     peak_index, peak_val])
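Note that r, g, b = im.split() assumes three-channel RGB input, which the successful full run below suggests holds for the train PNGs. A more defensive variant (a sketch, not part of the original kernel) would force RGB first:

# Hypothetical guard: normalize to three channels before splitting, in case a
# PNG is palette-based, grayscale, or carries an alpha channel.
im = PIL.Image.open(f'{path}{img_id}.png').convert('RGB')
r, g, b = im.split()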
In [3]:
df_train = pd.read_csv('../input/train.csv')
In [4]:
meta_cols = ['width', 'height',
             'r_mean', 'r_std', 'g_mean', 'g_std', 'b_mean', 'b_std',
             'r_edge_mean', 'r_edge_std', 'g_edge_mean', 'g_edge_std', 'b_edge_mean', 'b_edge_std',
             'peak_index', 'peak_val']

# pre-allocate the meta data columns before they are filled in below
for col in meta_cols:
    df_train[col] = 0
In [5]:
n_partitions = 12  # number of chunks the dataframe is split into
n_workers = 12     # number of worker processes
train_path = '../input/train/'

def parallelize_dataframe(df, func):
    # split the dataframe into chunks, apply func to each chunk in a
    # worker process, then stitch the results back together
    df_split = np.array_split(df, n_partitions)
    pool = Pool(n_workers)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df


def get_meta_data(data):
    # compute the 16 image-level meta data features for every image in this chunk
    meta_data = np.zeros((data.shape[0], 16))
    
    for index, file_id in enumerate(data['id'].values):
        meta_data[index] = get_img_properties(file_id, train_path)
    
    data[meta_cols] = meta_data
    
    return data
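The split → map → concat pattern in parallelize_dataframe can be illustrated on a toy frame (a self-contained sketch, independent of the competition data; as a standalone script on spawn-based platforms it would also need the usual if __name__ == '__main__': guard):

# Toy illustration: split a frame into chunks, transform each chunk in a
# worker process, then concatenate the results back into one frame.
def add_one(chunk):
    chunk['x'] = chunk['x'] + 1
    return chunk

toy = pd.DataFrame({'x': range(10)})
chunks = np.array_split(toy, 4)
with Pool(4) as pool:
    toy = pd.concat(pool.map(add_one, chunks))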
In [6]:
%%time

df_train = parallelize_dataframe(df_train, get_meta_data)
CPU times: user 856 ms, sys: 664 ms, total: 1.52 s
Wall time: 11min 24s
In [7]:
label_df = pd.read_csv('../input/labels.csv')
label_names = label_df['attribute_name'].values
In [8]:
%%time
train_labels = np.zeros((df_train.shape[0], len(label_names)))

# attribute_ids is a space-separated string of label indices, e.g. '147 616 813'
for row_index, row in enumerate(df_train['attribute_ids']):
    for label in row.split():
        train_labels[row_index, int(label)] = 1
CPU times: user 320 ms, sys: 364 ms, total: 684 ms
Wall time: 683 ms
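A quick sanity check (optional, not part of the original kernel): each row of train_labels should contain exactly one 1 per id listed in its attribute_ids string.

# Optional check: the one-hot row sums should match the number of attribute ids per image
assert (train_labels.sum(axis=1) == df_train['attribute_ids'].str.split().str.len()).all()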
In [9]:
for col in label_names:
    df_train[col] = 0
In [10]:
gc.collect()
Out[10]:
431
In [11]:
%%time

df_train[label_names] = train_labels
CPU times: user 3min 16s, sys: 4min 45s, total: 8min 2s
Wall time: 8min 2s
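The in-place block assignment above is the slow step. A possible alternative (a sketch, untested here) is to build the one-hot block as its own DataFrame and attach it with a single concat, which would replace cells [9] and [11]:

# Sketch of an alternative to cells [9] and [11]: build the label block once
# and attach it in one concat instead of filling every label column in place.
label_block = pd.DataFrame(train_labels, columns=label_names, index=df_train.index)
df_train = pd.concat([df_train, label_block], axis=1)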
In [12]:
df_train.head()
Out[12]:
                 id        attribute_ids  width  height      r_mean      r_std      g_mean      g_std      b_mean      b_std
0  1000483014d91860          147 616 813  339.0   300.0  193.991740  47.468273  182.179961  44.824124  169.927552  42.199740
1  1000fe2e667721fe       51 616 734 813  423.0   300.0  211.164563  47.995696  202.685390  47.376967  195.788723  47.269632
2  1001614cb89646ee                  776  365.0   300.0  149.338986  59.436609  130.213662  61.325377  120.461196  56.884549
3  10041eb49b297c08  51 671 698 813 1092  300.0   358.0  191.176155  53.101640  175.126024  53.289562  161.795326  52.821732
4  100501c227f8beea  13 404 492 903 1093  300.0   528.0  155.093504  52.570650  134.016162  52.261422  112.345625  49.316927

   r_edge_mean  r_edge_std  g_edge_mean  g_edge_std  b_edge_mean  b_edge_std  peak_index  peak_val
0    64.077276   91.260457    63.554710   90.883635    63.069469   90.499120       707.0  0.043609
1    56.946785   88.436781    56.887746   88.284080    56.927352   88.050447       235.0  0.048597
2    46.891087   80.245607    46.813242   79.913449    46.527963   79.297113       581.0  0.017114
3    58.894004   88.164013    58.830549   88.092523    58.747598   87.832953       239.0  0.031480
4    21.012077   41.658056    20.801938   41.005110    20.511275   40.086849       647.0  0.015789

(the remaining columns are the one-hot labels, culture::abruzzi through tag::zodiac; among the columns visible in the truncated output they are all 0.0 except tag::women = 1.0 for row 3 and culture::american = 1.0, tag::working = 1.0 for row 4)
In [13]:
df_train.to_csv('weird_images_w_labels.csv', index=False)
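For reference, the downstream EDA kernel can load the file straight back with pd.read_csv; downcasting the one-hot columns afterwards (an optional step, assuming label_names is rebuilt from labels.csv there) keeps memory down:

# Hypothetical read-back in the EDA kernel; the uint8 downcast is an assumption
# to save memory, not something this kernel requires.
df = pd.read_csv('weird_images_w_labels.csv')
df[label_names] = df[label_names].astype('uint8')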