EDA-classes(most frequent and least frequent)

From: https://www.kaggle.com/jionie/eda-classes-most-frequent-and-least-frequent

Author: jionie

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

import numpy as np
import pandas as pd
import os
import copy
import sys
from PIL import Image
import time 
from tqdm.autonotebook import tqdm
import random
import gc
import cv2
import scipy
import math
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.metrics import fbeta_score

import torch
from torch.utils.data import TensorDataset, DataLoader,Dataset
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.optim.optimizer import Optimizer
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
from torch.utils.data.sampler import SubsetRandomSampler
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau, CosineAnnealingLR, _LRScheduler

# Any results you write to the current directory are saved as output.
['labels.csv', 'train', 'test', 'train.csv', 'sample_submission.csv']
/opt/conda/lib/python3.6/site-packages/tqdm/autonotebook/__init__.py:14: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)
  " (e.g. in jupyter console)", TqdmExperimentalWarning)

We get all classes of labels

In [2]:
import scipy.special

SEED = 42
base_dir = '../input/'


def seed_everything(seed=SEED):
    """Seed every random number generator this notebook uses.

    Args:
        seed: Integer seed applied to ``random``, NumPy and PyTorch
            (CPU and all CUDA devices).
    """
    random.seed(seed)
    # The variable is spelled PYTHONHASHSEED. Setting it here only affects
    # interpreters started afterwards (e.g. subprocesses); hash randomisation
    # of the running kernel is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Silently ignored when CUDA is unavailable; covers every visible GPU.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False


seed_everything(SEED)
In [3]:
# Competition tables; test_df is read from the sample submission file.
train_df = pd.read_csv('../input/train.csv')
labels_df = pd.read_csv('../input/labels.csv')
test_df = pd.read_csv('../input/sample_submission.csv')

# Image id -> space-separated attribute id string.
img_class_dict = dict(zip(train_df['id'], train_df['attribute_ids']))

# Hold out 15% of the image ids, seeded with the notebook-wide SEED.
tr, val = train_test_split(train_df['id'], test_size=0.15, random_state=SEED)

def get_label(attribute_ids, num_classes=1103):
    """Convert a space-separated attribute id string into a multi-hot vector.

    Args:
        attribute_ids: String of integer ids separated by whitespace,
            e.g. ``"13 405 896"``.
        num_classes: Length of the returned vector (defaults to 1103).

    Returns:
        1-D integer ``numpy.ndarray`` of length ``num_classes`` holding 1 at
        every listed id and 0 elsewhere.

    Raises:
        IndexError: If an id is outside ``[-num_classes, num_classes)``.
        ValueError: If a token is not an integer.
    """
    # ``np.int`` was a plain alias of the builtin ``int`` and was removed in
    # NumPy 1.24; the builtin yields the same dtype.
    one_hot = np.zeros(num_classes, dtype=int)
    for token in attribute_ids.split():
        one_hot[int(token)] = 1
    return one_hot
In [4]:
# Column names of the training table, then of the label table.
for frame in (train_df, labels_df):
    print(frame.columns)
Index(['id', 'attribute_ids'], dtype='object')
Index(['attribute_id', 'attribute_name'], dtype='object')
In [5]:
# Count how often each exact attribute combination (the raw 'attribute_ids'
# string) occurs. The output columns are named explicitly instead of renaming
# 'index'/'attribute_ids' afterwards: pandas 2.0 changed the names that
# value_counts().reset_index() produces, which made that rename label the
# wrong columns.
classes = (
    train_df['attribute_ids']
    .value_counts()
    .rename_axis('classes')
    .reset_index(name='counts')
)
In [6]:
# Text dump of the combination counts (value_counts orders rows by descending count).
print(classes)
                           classes  counts
0                  13 405 896 1092    1158
1                          813 896     586
2                         194 1034     489
3                           13 552     482
4                         121 1059     465
5                          121 433     425
6                           13 626     365
7                          79 1059     352
8                       13 813 896     339
9                         121 1039     332
10                        189 1034     329
11                        896 1092     328
12                         121 962     290
13                         79 1062     261
14                147 671 780 1034     245
15                        1034 369     241
16                            1059     234
17                        188 1034     230
18                        194 1059     221
19                             684     221
20                     121 724 955     219
21                          79 487     218
22                     147 813 896     210
23                         304 487     201
24                             813     201
25                            1092     200
26                         147 813     195
27                     121 432 724     191
28                          13 519     188
29                     615 813 896     187
...                            ...     ...
50208              51 813 971 1092       1
50209                  294 584 671       1
50210               43 51 671 1030       1
50211     147 477 480 616 655 1092       1
50212              156 554 744 784       1
50213     189 430 462 813 974 1020       1
50214                  13 626 1092       1
50215          189 378 408 420 988       1
50216                   26 156 796       1
50217         134 501 776 1030 369       1
50218                   79 489 546       1
50219      13 480 501 512 813 1092       1
50220                      420 833       1
50221                  194 506 744       1
50222                      513 369       1
50223             480 501 737 1046       1
50224             189 325 896 1059       1
50225          51 418 932 951 1092       1
50226                  194 477 809       1
50227                 189 996 1092       1
50228                 591 896 1092       1
50229         147 835 890 953 1092       1
50230             738 885 991 1093       1
50231  40 147 477 532 671 813 1092       1
50232                  736 813 993       1
50233   156 189 813 1001 1009 1092       1
50234                          983       1
50235                  111 886 971       1
50236          480 501 548 554 744       1
50237      147 501 670 744 813 903       1

[50238 rows x 2 columns]
In [7]:
#classes['classes'] = classes['classes'].apply(get_label)
In [8]:
# Share of all training rows that carry each attribute combination.
classes['ratio'] = classes['counts'] / len(train_df)
In [9]:
# Preview the first ten rows, i.e. the ten most frequent combinations.
classes.head(10)
Out[9]:
classes counts ratio
0 13 405 896 1092 1158 0.010601
1 813 896 586 0.005364
2 194 1034 489 0.004477
3 13 552 482 0.004412
4 121 1059 465 0.004257
5 121 433 425 0.003891
6 13 626 365 0.003341
7 79 1059 352 0.003222
8 13 813 896 339 0.003103
9 121 1039 332 0.003039

We have 50238 distinct classes (label combinations), which is far too many: each class contains very few samples compared to the whole dataset.
If we used a triplet loss, an anchor could hardly ever find a positive sample within a batch.

Let's see the samples

In [10]:
def get_label_name(attribute_ids):
    """Look up the ``labels_df`` rows for every id in an attribute id string.

    Args:
        attribute_ids: String of integer ids separated by whitespace.

    Returns:
        A list with one entry per id. Each entry is the slice of ``labels_df``
        whose 'attribute_id' equals that id (a DataFrame, not the bare name
        string).
    """
    return [
        labels_df.loc[labels_df['attribute_id'] == int(token)]
        for token in attribute_ids.split()
    ]
In [11]:
#train_df['attribute_name'] = train_df['attribute_ids'].apply(get_label_name)
#too slow
In [12]:
# For every image, the number of images that share its exact 'attribute_ids' string.
train_df['count'] = train_df.groupby(['attribute_ids'])['id'].transform('count')
In [13]:
# Order rows by the 'attribute_ids' string so identical combinations sit next to each other.
train_df = train_df.sort_values(by='attribute_ids')
In [14]:
#train_df['attribute_ids'] = train_df['attribute_ids'].apply(get_label)
In [15]:
# Preview the first 30 rows of the sorted training table.
train_df.head(30)
Out[15]:
id attribute_ids count
67541 a482440061467a99 0 189 1034 4
58592 90c15effd2289593 0 189 1034 4
77098 b9e1937a64eec95f 0 189 1034 4
28323 4e208cdf60a7217c 0 189 1034 4
34342 5b99e1f55d2b95ac 0 189 420 477 768 1
33129 58f39a1a04a2fc45 0 189 420 768 1
50740 7f5c4771b93e0e74 0 189 596 768 1
94731 e0c972e371423d82 0 189 671 768 1
55744 8a4c98c54673fb88 0 189 738 768 1
97389 e68fe4d1fe3230ef 0 189 768 7
86149 cdfd973d6398e9aa 0 189 768 7
69330 a8a8cd6e6e9f557b 0 189 768 7
87351 d093900e8edf7993 0 189 768 7
25160 472012b19ebd184d 0 189 768 7
63656 9c12b5edf257adcb 0 189 768 7
6848 1ef6eb0537f46794 0 189 768 7
57099 8d707b13b3e76286 0 189 768 835 1
17242 35eded28a5615922 0 189 786 1
83276 c7b5cfeb469fe177 1 1023 1099 1
89016 d437dbf5f8bdf128 1 1039 2
108806 ff091f98a8f3caca 1 1039 2
34640 5c5397135bacb1f9 1 1061 1
78325 bc96c379b19497cc 1 420 1
82870 c6d4c2c893f50ae5 1 420 1039 1
17043 35803794e7001310 1 420 1059 1
79869 c0045b1b5adcd3cc 1 420 477 962 1
10772 27dcc3b51078a646 1 420 598 962 1
9003 2401d4404ac3300d 1 420 738 962 1
101973 f0931c78d0ff9f5 1 420 813 962 1
79518 bf53ea03ce9d231 1 420 813 962 1046 1
In [16]:
# Image ids grouped by their exact 'attribute_ids' string; queried with get_group() below.
grouped_id = train_df.groupby('attribute_ids')['id']

We collect one image from each class

In [17]:
# Map one representative image id per attribute combination to the number of
# images that share that combination.
collect_image_names = {}

for key in classes['classes']:
    # Fetch the group once and reuse it for both the id and the size;
    # get_group is costly when repeated for tens of thousands of keys.
    group_ids = grouped_id.get_group(key).values
    name = group_ids[0]
    count = group_ids.shape[0]
    collect_image_names[name] = count
In [18]:
import operator

# (image id, count) pairs ordered from largest to smallest count. The list is
# sorted ascending and then reversed (rather than using reverse=True) so that
# entries with equal counts keep the same relative order as before.
sorted_collect_image_names = sorted(
    collect_image_names.items(), key=operator.itemgetter(1)
)[::-1]
print(len(sorted_collect_image_names))
50238
In [19]:
# First ten (image id, count) pairs, i.e. the largest counts after the descending sort.
print(sorted_collect_image_names[:10])
[('a110f50e41b78ec3', 1158), ('57360e0288d1fedb', 586), ('a23abedd35cd82d1', 489), ('cebadd40271212b6', 482), ('6c7d55522298bd51', 465), ('b8839855f08f733f', 425), ('dba95d98a2724839', 365), ('8476ac59cfb98dd', 352), ('9508ce44a0f3ea94', 339), ('7efe5ca99de45695', 332)]
In [20]:
# Attribute ids of the representative image of the most frequent combination.
image_name = sorted_collect_image_names[0][0]
attribute_ids = train_df.loc[train_df['id'] == image_name, 'attribute_ids'].values[0]
print(attribute_ids.split())
['13', '405', '896', '1092']

Let's see the top 10 most frequent classes (one image per class)

In [21]:
# One representative image for each of the (up to) 10 most frequent attribute
# combinations, titled with the attribute names and the combination's count.
plt.figure(figsize=[20, 20])
for c, (image_name, image_count) in enumerate(sorted_collect_image_names[:10], start=1):
    image_path = "../input/train/{}.png".format(image_name)
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports a missing/unreadable file by returning None
        # instead of raising; fail with a clear message.
        raise FileNotFoundError("could not read image: {}".format(image_path))
    img = img[..., [2, 1, 0]]  # OpenCV loads BGR; matplotlib expects RGB
    plt.subplot(5, 2, c)
    plt.imshow(img)

    attribute_ids = train_df.loc[train_df['id'] == image_name, 'attribute_ids'].values[0].split()
    attribute_name = [
        labels_df.loc[labels_df['attribute_id'] == int(ids), 'attribute_name'].values[0]
        for ids in attribute_ids
    ]
    plt.title("train image {} count {}".format(attribute_name, image_count))
plt.show()

Let's see the top 10 least frequent classes (one image per class)

In [22]:
# One representative image for each of the (up to) 10 least frequent attribute
# combinations, i.e. the tail of the descending-sorted list.
plt.figure(figsize=[20, 20])

for c, (image_name, image_count) in enumerate(sorted_collect_image_names[-10:], start=1):
    image_path = "../input/train/{}.png".format(image_name)
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports a missing/unreadable file by returning None
        # instead of raising; fail with a clear message.
        raise FileNotFoundError("could not read image: {}".format(image_path))
    img = img[..., [2, 1, 0]]  # OpenCV loads BGR; matplotlib expects RGB
    plt.subplot(5, 2, c)
    plt.imshow(img)

    attribute_ids = train_df.loc[train_df['id'] == image_name, 'attribute_ids'].values[0].split()
    attribute_name = [
        labels_df.loc[labels_df['attribute_id'] == int(ids), 'attribute_name'].values[0]
        for ids in attribute_ids
    ]
    plt.title("train image {} count {}".format(attribute_name, image_count))
plt.show()

Let's see 10 images from the most frequent class (one class)

The most frequent class is 'culture::american' + 'tag::actresses' + 'tag::portraits' + 'tag::women'.

In [23]:
# First image id and total image count of the most frequent combination.
most_frequent_ids = grouped_id.get_group(classes['classes'][0]).values
name = most_frequent_ids[0]
count = most_frequent_ids.shape[0]
In [24]:
# Up to 10 images that all carry the single most frequent attribute combination.
plt.figure(figsize=[20, 20])

# Fetch the group once instead of on every loop iteration.
most_frequent_ids = grouped_id.get_group(classes['classes'][0]).values
count = most_frequent_ids.shape[0]

# image id -> size of the combination; slicing copes with groups smaller than 10.
most_frequent_class_top_10 = {image_id: count for image_id in most_frequent_ids[:10]}

for c, (image_name, image_count) in enumerate(most_frequent_class_top_10.items(), start=1):
    image_path = "../input/train/{}.png".format(image_name)
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports a missing/unreadable file by returning None
        # instead of raising; fail with a clear message.
        raise FileNotFoundError("could not read image: {}".format(image_path))
    img = img[..., [2, 1, 0]]  # OpenCV loads BGR; matplotlib expects RGB
    plt.subplot(5, 2, c)
    plt.imshow(img)

    attribute_ids = train_df.loc[train_df['id'] == image_name, 'attribute_ids'].values[0].split()
    attribute_name = [
        labels_df.loc[labels_df['attribute_id'] == int(ids), 'attribute_name'].values[0]
        for ids in attribute_ids
    ]
    plt.title("train image {} count {}".format(attribute_name, image_count))
plt.show()

Let's get sorted categories count

In [25]:
# Start every attribute id (0 through 1102) at a count of zero.
category_count = dict.fromkeys(range(1103), 0)
In [26]:
# Credit every attribute id in a combination with the number of images that
# carry that combination.
for label_string in classes['classes']:
    n_images = grouped_id.get_group(label_string).values.shape[0]
    for attribute_id in map(int, label_string.split()):
        category_count[attribute_id] += n_images
In [27]:
# (attribute id, count) pairs from largest to smallest count; ascending sort
# followed by a reversal keeps the tie order unchanged.
sorted_category_count = sorted(category_count.items(), key=lambda item: item[1])[::-1]
In [28]:
# Tabulate the per-attribute counts and add each attribute's share of all training rows.
sorted_category_count_frame = pd.DataFrame(
    sorted_category_count, columns=['attribute_id', 'count']
)
sorted_category_count_frame['ratio'] = sorted_category_count_frame['count'] / len(train_df)
In [29]:
# Preview the 30 attribute ids with the highest counts.
sorted_category_count_frame.head(30)
Out[29]:
attribute_id count ratio
0 813 19970 0.182814
1 1092 14281 0.130734
2 147 13522 0.123786
3 189 10375 0.094977
4 13 9151 0.083772
5 671 8419 0.077071
6 51 7615 0.069711
7 194 7394 0.067688
8 1059 6564 0.060090
9 121 6542 0.059888
10 896 5955 0.054514
11 1046 5591 0.051182
12 79 5382 0.049269
13 780 5259 0.048143
14 156 5163 0.047264
15 369 4416 0.040426
16 744 3890 0.035611
17 477 3692 0.033798
18 738 3665 0.033551
19 1034 3570 0.032681
20 188 3500 0.032040
21 835 3005 0.027509
22 903 2552 0.023362
23 420 2548 0.023325
24 1099 2327 0.021302
25 552 2180 0.019957
26 485 2097 0.019197
27 776 2075 0.018995
28 161 2050 0.018767
29 489 2045 0.018721
In [30]:
# Re-key the per-attribute counts by attribute name instead of numeric id.
category_name_count = {}

for attribute_id, attribute_total in sorted_category_count:
    matches = labels_df.loc[labels_df['attribute_id'] == attribute_id, 'attribute_name']
    category_name_count[matches.values[0]] = attribute_total
In [31]:
# (attribute name, count) pairs from largest to smallest count; ascending sort
# followed by a reversal keeps the tie order unchanged.
sorted_category_name_count = sorted(category_name_count.items(), key=lambda item: item[1])[::-1]
In [32]:
# Tabulate the per-name counts and add each attribute's share of all training rows.
sorted_sorted_category_name_count_frame = pd.DataFrame(
    sorted_category_name_count, columns=['attribute_name', 'count']
)
sorted_sorted_category_name_count_frame['ratio'] = (
    sorted_sorted_category_name_count_frame['count'] / len(train_df)
)
In [33]:
# Preview the 30 attribute names with the highest counts.
sorted_sorted_category_name_count_frame.head(30)
Out[33]:
attribute_name count ratio
0 tag::men 19970 0.182814
1 tag::women 14281 0.130734
2 culture::french 13522 0.123786
3 culture::italian 10375 0.094977
4 culture::american 9151 0.083772
5 tag::flowers 8419 0.077071
6 culture::british 7615 0.069711
7 culture::japan 7394 0.067688
8 tag::utilitarian objects 6564 0.060090
9 culture::egyptian 6542 0.059888
10 tag::portraits 5955 0.054514
11 tag::trees 5591 0.051182
12 culture::china 5382 0.049269
13 tag::leaves 5259 0.048143
14 culture::german 5163 0.047264
15 culture::turkish or venice 4416 0.040426
16 tag::inscriptions 3890 0.035611
17 tag::birds 3692 0.033798
18 tag::human figures 3665 0.033551
19 tag::textile fragments 3570 0.032681
20 culture::islamic 3500 0.032040
21 tag::mythical creatures 3005 0.027509
22 tag::profiles 2552 0.023362
23 tag::animals 2548 0.023325
24 tag::writing systems 2327 0.021302
25 tag::clothing and accessories 2180 0.019957
26 tag::books 2097 0.019197
27 tag::landscapes 2075 0.018995
28 culture::greek 2050 0.018767
29 tag::bowls 2045 0.018721

We can see that the most frequent categories are 'tag::men' (ratio 0.18) and 'tag::women' (ratio 0.13), followed by several 'culture::...' attributes (e.g. 'culture::french', ratio 0.12) and 'tag::flowers' (ratio 0.08).

In [34]:
# Save the per-attribute-id counts and ratios to the working directory, without the row index.
sorted_category_count_frame.to_csv('sorted_category_count_frame.csv', index=False)