iMet Data Analysis, Plus!

From: https://www.kaggle.com/hsakizero/imet-data-analysis-plus

Author: Hsaki

In [1]:
import numpy as np
import pandas as pd
import pylab as plt
import seaborn as sns
import cv2
import os

Take a Glimpse

In [2]:
train = pd.read_csv('../input/train.csv')
labels = pd.read_csv('../input/labels.csv')
test = pd.read_csv('../input/sample_submission.csv')

train.head(5)
Out[2]:
id attribute_ids
0 1000483014d91860 147 616 813
1 1000fe2e667721fe 51 616 734 813
2 1001614cb89646ee 776
3 10041eb49b297c08 51 671 698 813 1092
4 100501c227f8beea 13 404 492 903 1093
In [3]:
print('Number of train samples: ', train.shape[0])
print('Number of test samples: ', test.shape[0])
print('Number of labels: ', labels.shape[0])
Number of train samples:  109237
Number of test samples:  7443
Number of labels:  1103

Most Frequent and Infrequent Attributes

In [4]:
attribute_ids = train['attribute_ids'].values
attributes = []
for item_attributes in [x.split(' ') for x in attribute_ids]:
    for attribute in item_attributes:
        attributes.append(int(attribute))
        
att_pd = pd.DataFrame(attributes, columns=['attribute_id'])
att_pd = att_pd.merge(labels) #merge id with labels

frequent= att_pd['attribute_name'].value_counts()[:30].to_frame()

f, ax = plt.subplots(figsize=(12, 8))
ax = sns.barplot(y=frequent.index, x="attribute_name", data=frequent, palette="rocket", order=reversed(frequent.index))
ax.set_ylabel("Surface type")
ax.set_xlabel("Count")
sns.despine()
plt.title('Most frequent attributes')
plt.show()     
In [5]:
infrequent= att_pd['attribute_name'].value_counts(ascending=True)[:15].to_frame()
f, ax = plt.subplots(figsize=(12, 4))
ax = sns.barplot(y=infrequent.index, x="attribute_name", data=infrequent, palette="rocket", order=reversed(infrequent.index))
ax.set_ylabel("Surface type")
ax.set_xlabel("Count")
sns.despine() # remove the upper and right border
plt.title('Most infrequent attributes')
plt.show() 
In [6]:
train['Number of Tags'] = train['attribute_ids'].apply(lambda x: len(x.split(' ')))
f, ax = plt.subplots(figsize=(9, 8))
ax = sns.countplot(x="Number of Tags", data=train, palette="GnBu_d")
ax.set_ylabel("Surface type")
sns.despine()
plt.show()

Analysis of Picture Size

In [7]:
width = []
height = []
for img_name in os.listdir("../input/train/")[-500:]:
    shape = cv2.imread("../input/train/%s" % img_name).shape
    height.append(shape[0])
    width.append(shape[1])
size = pd.DataFrame({'height':height, 'width':width})
sns.jointplot("height", "width", size, kind='reg')  
plt.show()
/opt/conda/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
In [8]:
size[:10]
Out[8]:
height width
0 300 300
1 369 300
2 531 300
3 375 300
4 300 416
5 300 449
6 300 463
7 300 517
8 393 300
9 300 334

So we can see that the length of one of two sides is 300

In [9]:
print('The average height is ' + str(np.mean(size.height)))
print('The median height is ' + str(np.median(size.height)))
print('The average width is ' + str(np.mean(size.width)))
print('The median width is ' + str(np.median(size.width)))
The average height is 372.162
The median height is 300.0
The average width is 379.516
The median width is 300.0

Attributes Statistics

Pictures with Most Attributes

In [10]:
most_att = train[train['Number of Tags']>9]
least_att = train[train['Number of Tags']<2]

count = 1
plt.figure(figsize=[30,20])
for img_name in most_att['id'].values:
    img = cv2.imread("../input/train/%s.png" % img_name)
    plt.subplot(2, 3, count)
    plt.imshow(img)
    count += 1
plt.show
Out[10]:
<function matplotlib.pyplot.show(*args, **kw)>

Pictures with Least Attributes

In [11]:
count = 1
plt.figure(figsize=[30,20])
for img_name in least_att['id'].values[:6]:
    img = cv2.imread("../input/train/%s.png" % img_name)
    plt.subplot(2, 3, count)
    plt.imshow(img)
    count += 1
plt.show
Out[11]:
<function matplotlib.pyplot.show(*args, **kw)>