From: https://www.kaggle.com/mariammohamed/imet-eda
Author: Mariam Mohamed
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import keras
%matplotlib inline
# Load the attribute label table. Its `attribute_name` column is later split
# on "::" into a class part and a subclass part.
labels_data = pd.read_csv('../input/labels.csv')
# Notebook-style inspection: dimensions, then the first and the last rows.
labels_data.shape
labels_data.head()
labels_data.tail()
# Attribute names have the form "<class>::<subclass>". Split each name once
# (n=1) so that a subclass which itself contains "::" is kept whole instead of
# being cut at the second separator, and so every row is split only one time.
# A name without any "::" ends up with a missing (NaN) subclass.
name_parts = labels_data['attribute_name'].str.split('::', n=1)
labels_data['class_name'] = name_parts.str[0]
labels_data['subclass_name'] = name_parts.str[1]
# Distinct top-level classes present in the label table.
labels_data.class_name.unique()
# Bar chart: number of attributes (subclasses) belonging to each class.
# The plotted variable is passed by keyword because seaborn >= 0.12 accepts
# only `data` positionally. The bar order is fixed explicitly so the
# annotations below are guaranteed to land on the matching bar.
class_order = list(labels_data['class_name'].dropna().unique())
class_sizes = labels_data['class_name'].value_counts()
sns.countplot(x='class_name', data=labels_data, order=class_order)
plt.xlabel('Class Name')
plt.ylabel('Number of subclasses')
# Write each bar's count in white, centred horizontally and placed at half
# the bar height so the text stays inside the bar whatever the counts are.
for bar_position, class_name in enumerate(class_order):
    class_size = class_sizes[class_name]
    plt.text(x=bar_position, y=class_size / 2, s=str(class_size), color='white', horizontalalignment='center', size='large', weight='bold')
plt.show()
# Boolean mask over the label rows: True where the row belongs to the 'tag'
# class. Plotting it against the row position shows whether rows of the same
# class are stored contiguously in the label table.
classes_ordered = labels_data['class_name'] == 'tag'
row_positions = np.arange(labels_data.shape[0])
plt.plot(row_positions, classes_ordered)
plt.show()
The labels appear to be ordered: all culture attributes come first (at the top of the file), followed by the tag attributes.
# Report how many distinct subclass names (the part after "::") exist.
unique_subclass_names = labels_data['subclass_name'].unique()
print('Number of unique attributes = {0}'.format(len(unique_subclass_names)))
# Load the training table. Its `attribute_ids` column holds, for each row, a
# whitespace-separated string of integer attribute ids (parsed further below).
train_data = pd.read_csv('../input/train.csv')
# Notebook-style inspection: dimensions and the first rows.
train_data.shape
train_data.head()
Let's see how many images each class has:
# Tally of training images per attribute id. The list has one slot per row of
# the label table and is indexed by the integer attribute id.
num_images_in_classes = [0 for _ in range(labels_data.shape[0])]


def count(record):
    """Add one to the tally of every attribute id listed in ``record``.

    ``record`` is a whitespace-separated string of integer attribute ids.
    The module-level ``num_images_in_classes`` list is updated in place and
    nothing is returned.
    """
    for attribute_id in map(int, record.split()):
        num_images_in_classes[attribute_id] += 1


# Feed every training row's id string through the tally.
for image_attribute_ids in train_data['attribute_ids']:
    count(image_attribute_ids)
# Line plot of the per-attribute image tally, with the attribute's row
# position in the label table on the x axis.
attribute_positions = np.arange(labels_data.shape[0])
plt.plot(attribute_positions, num_images_in_classes)
plt.ylabel('Number of images')
plt.xlabel('Class')
plt.show()
The dataset is imbalanced: some classes have many images, while others have very few.
# Number of attribute ids attached to each training image.
len_classes_images = train_data.attribute_ids.apply(lambda ids: len(ids.split()))
fewest_labels = int(len_classes_images.min())
most_labels = int(len_classes_images.max())
print('Max number of classes for one image = {0}, Min number of classes for one image = {1}'.format(most_labels, fewest_labels))
# Distribution of labels-per-image. countplot draws its bars at categorical
# positions 0..k-1, so forcing the ticks to 1..11 with plt.xticks would move
# the ticks off the bars. Instead, request one bar for every value from the
# observed minimum to the observed maximum (values with no images show as an
# empty slot), which keeps each tick label under its own bar and avoids
# hard-coding the maximum. The variable is passed by keyword because
# seaborn >= 0.12 accepts only `data` positionally.
label_count_order = list(range(fewest_labels, most_labels + 1))
sns.countplot(x=len_classes_images, order=label_count_order)
plt.xlabel('Number of classes for one image')
plt.ylabel('Number of images')
plt.show()
# Attribute ids at or above this value are counted as tags, lower ids as
# culture attributes.
# NOTE(review): presumably 398 is the number of culture rows at the top of
# labels.csv - confirm against the label file.
FIRST_TAG_ID = 398

# One list of booleans per image: True for each of its ids that is a tag id.
tags = train_data.attribute_ids.apply(lambda ids: [int(attribute_id) >= FIRST_TAG_ID for attribute_id in ids.split()])
# Images whose ids are all tag ids (no culture id at all).
all_tags = [flags for flags in tags if all(flags)]
# Images with no tag id at all (culture ids only).
all_culture = [flags for flags in tags if not any(flags)]

culture_only_total = len(all_culture)
tags_only_total = len(all_tags)
# Everything that is neither tags-only nor culture-only mixes both kinds.
mixed_total = train_data.shape[0] - tags_only_total - culture_only_total
print('Number of images with the culture tag only = {0}'.format(culture_only_total))
print('Number of images without the culture tag = {0}'.format(tags_only_total))
print('Number of images with culture and other tags = {0}'.format(mixed_total))
To be continued