IMet-EDA

From: https://www.kaggle.com/mariammohamed/imet-eda

Author: Mariam Mohamed

In [1]:
import numpy as np 
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import keras
Using TensorFlow backend.
In [2]:
# Render matplotlib figures inline, directly below the cell that draws them.
%matplotlib inline
In [3]:
# Load the attribute lookup table: one row per attribute_id with its attribute_name.
labels_data = pd.read_csv('../input/labels.csv')
In [4]:
# Dimensions of the label table as (rows, columns).
labels_data.shape
Out[4]:
(1103, 2)
In [5]:
# Preview the first rows of the label table.
labels_data.head()
Out[5]:
attribute_id attribute_name
0 0 culture::abruzzi
1 1 culture::achaemenid
2 2 culture::aegean
3 3 culture::afghan
4 4 culture::after british
In [6]:
# Preview the last rows of the label table.
labels_data.tail()
Out[6]:
attribute_id attribute_name
1098 1098 tag::writing implements
1099 1099 tag::writing systems
1100 1100 tag::zeus
1101 1101 tag::zigzag pattern
1102 1102 tag::zodiac
In [7]:
# Attribute names have the form "<class>::<sub-class>" (e.g. "culture::aegean").
# Split each name once, on the first "::" only, so a sub-class that itself
# contains "::" is kept intact instead of being silently truncated.
name_parts = labels_data['attribute_name'].str.split('::', n=1)
labels_data['class_name'] = name_parts.str[0]
labels_data['subclass_name'] = name_parts.str[1]

# A name without "::" yields a missing sub-class; fail loudly rather than
# carrying NaN into the later analysis.
assert labels_data['subclass_name'].notna().all(), 'attribute_name without a "::" separator'
In [8]:
# Distinct top-level classes found in the attribute names.
labels_data.class_name.unique()
Out[8]:
array(['culture', 'tag'], dtype=object)
In [9]:
# Bar chart of how many attributes (sub-classes) each top-level class contains.
# Bars are drawn in order of first appearance in the label table.
class_order = list(labels_data['class_name'].unique())
class_counts = labels_data['class_name'].value_counts()

# Pass the column through `x=`: a positional Series is deprecated in seaborn
# 0.11 and is no longer interpreted as `x` in later releases.
ax = sns.countplot(x=labels_data['class_name'], order=class_order)
ax.set_xlabel('Class Name')
ax.set_ylabel('Number of subclasses')

# Annotate every bar with its count, centred inside the bar. Positions and
# heights come from the data rather than hard-coded coordinates.
for bar_position, class_name in enumerate(class_order):
    n_subclasses = int(class_counts[class_name])
    ax.text(x=bar_position, y=n_subclasses / 2, s=str(n_subclasses), color='white',
            horizontalalignment='center', verticalalignment='center',
            size='large', weight='bold')
plt.show()
In [10]:
# Boolean flag per label row: True where the attribute is a "tag", False otherwise.
# Plotted against the row position, a single step means the rows are grouped by class.
classes_ordered = labels_data['class_name'] == 'tag'
row_positions = np.arange(len(labels_data))
plt.plot(row_positions, classes_ordered)
plt.show()

The labels appear to be ordered: all the culture attributes come first (at the top), followed by all the tag attributes.

In [11]:
# Count distinct sub-class names; if this equals the number of label rows,
# no sub-class name is repeated.
n_unique_subclasses = labels_data['subclass_name'].nunique()
print('Number of unique attributes = {0}'.format(n_unique_subclasses))
Number of unique attributes = 1103
In [12]:
# Load the training annotations: one row per image id with its space-separated attribute ids.
train_data = pd.read_csv('../input/train.csv')
In [13]:
# Dimensions of the training table as (rows, columns).
train_data.shape
Out[13]:
(109237, 2)
In [14]:
# Preview the first rows of the training table.
train_data.head()
Out[14]:
id attribute_ids
0 1000483014d91860 147 616 813
1 1000fe2e667721fe 51 616 734 813
2 1001614cb89646ee 776
3 10041eb49b297c08 51 671 698 813 1092
4 100501c227f8beea 13 404 492 903 1093

Let's see:

  1. Can an image belong to two categories (culture & tag)?
  2. How are the classes distributed? How many images per class? How many classes can an image belong to?
In [15]:
# Per-attribute tally: slot i holds the number of training images labelled
# with attribute id i.
num_images_in_classes = [0 for _ in range(labels_data.shape[0])]


def count(record):
    """Add one to the tally of every attribute id listed in ``record``.

    ``record`` is a whitespace-separated string of integer attribute ids.
    The module-level list ``num_images_in_classes`` is updated in place;
    nothing is returned.
    """
    for attribute_id in map(int, record.split()):
        num_images_in_classes[attribute_id] += 1


# Walk over every training row once, purely for the side effect on the tally.
for attribute_ids in train_data['attribute_ids']:
    count(attribute_ids)
In [16]:
# Line plot of the per-attribute image tally, indexed by attribute id.
class_ids = np.arange(labels_data.shape[0])
fig, ax = plt.subplots()
ax.plot(class_ids, num_images_in_classes)
ax.set_xlabel('Class')
ax.set_ylabel('Number of images')
plt.show()

The dataset is imbalanced: some classes have many images, while others have very few.

In [17]:
# Number of attribute ids attached to each training image.
len_classes_images = train_data['attribute_ids'].map(lambda ids: len(ids.split()))
In [18]:
# Report the largest and smallest number of labels carried by a single image.
max_classes = len_classes_images.max()
min_classes = len_classes_images.min()
print('Max number of classes for one image = {0}, Min number of classes for one image = {1}'.format(max_classes, min_classes))
Max number of classes for one image = 11, Min number of classes for one image = 1
In [19]:
# Distribution of the number of labels per image.
# countplot is categorical: it draws one bar per category at positions
# 0..k-1 and labels those ticks itself. Moving the ticks to 1..11 by hand
# would shift every label one bar to the right, so no manual xticks are set.
# `order` spans min..max so a label count with no images still gets an
# (empty) slot and the axis stays contiguous.
labels_per_image_order = list(range(int(len_classes_images.min()), int(len_classes_images.max()) + 1))

# Pass the data through `x=`: a positional Series is deprecated in seaborn
# 0.11 and is no longer interpreted as `x` in later releases.
sns.countplot(x=len_classes_images, order=labels_per_image_order)
plt.xlabel('Number of classes for one image')
plt.ylabel('Number of images')
plt.show()
In [20]:
# Ids of all attributes whose class is "tag", taken from the label table
# instead of a hard-coded id threshold. This stays correct even if the ids
# are not contiguous or the number of culture labels changes.
tag_attribute_ids = set(labels_data.loc[labels_data['class_name'] == 'tag', 'attribute_id'].tolist())

# For every image: a list with one boolean per attribute id, True when that
# attribute is a tag and False when it is a culture.
tags = train_data.attribute_ids.apply(lambda x: [int(y) in tag_attribute_ids for y in x.split()])
In [21]:
# Each entry of `tags` is a per-image list of booleans (True = tag attribute).
# Images whose flags are all True carry no culture attribute at all.
all_tags = [flags for flags in tags if all(flags)]
# Images whose flags are all False carry culture attributes only.
all_culture = [flags for flags in tags if not any(flags)]

n_culture_only = len(all_culture)
n_tags_only = len(all_tags)
# Whatever remains has at least one culture and at least one tag attribute.
n_mixed = train_data.shape[0] - n_tags_only - n_culture_only

print('Number of images with the culture tag only = {0}'.format(n_culture_only))
print('Number of images without the culture tag = {0}'.format(n_tags_only))
print('Number of images with culture and other tags = {0}'.format(n_mixed))
Number of images with the culture tag only = 220
Number of images without the culture tag = 11872
Number of images with culture and other tags = 97145

To be continued