Exploration of the iMet data, step by step

From: https://www.kaggle.com/bigswimatom/exploration-of-data-of-imet-step-by-step

Author: Katsunori Nakai

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
print(os.listdir("../input"))
['labels.csv', 'train', 'test', 'train.csv', 'sample_submission.csv']
In [2]:
# View
import matplotlib.pyplot as plt
from PIL import Image, ImageFilter
%matplotlib inline
In [3]:
from collections import Counter
In [4]:
# Read the three competition CSVs in one pass; in each file the first CSV
# column is used as the DataFrame index (index_col=0).
train_df, labels_df, sample_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../input/train.csv",
        "../input/labels.csv",
        "../input/sample_submission.csv",
    )
)

Describe

In [5]:
# Summarise the training labels: row count, number of distinct label strings,
# and the most frequent label string with its frequency.
train_df.describe()
Out[5]:
attribute_ids
count 109237
unique 50238
top 13 405 896 1092
freq 1158
In [6]:
# Summarise the label dictionary; count == unique means every attribute name is distinct.
labels_df.describe()
Out[6]:
attribute_name
count 1103
unique 1103
top tag::robes
freq 1
In [7]:
# Summarise the sample submission; a single unique value means every row
# carries the same placeholder prediction string.
sample_df.describe()
Out[7]:
attribute_ids
count 7443
unique 1
top 0 1 2
freq 7443

ABC analysis

https://en.wikipedia.org/wiki/ABC_analysis

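With 1,103 distinct labels, the label space is too large to handle comfortably in a kernel competition. One way to cope is to classify using only the rank-A attributes, i.e. the most frequent attributes that together account for the first 70% of all label occurrences.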

In [8]:
def flatten(x):
    """Recursively flatten nested iterables into a single flat list.

    Every element of ``x`` that exposes ``__iter__`` and is not a ``str`` is
    descended into; everything else (including strings) is kept as one item.

    Parameters
    ----------
    x : iterable
        The (possibly nested) collection to flatten.

    Returns
    -------
    list
        The leaf items in their original left-to-right order.
    """
    flat = []
    for item in x:
        # Strings are iterable but must be treated as atomic values.
        is_nested = hasattr(item, '__iter__') and not isinstance(item, str)
        if is_nested:
            flat.extend(flatten(item))
        else:
            flat.append(item)
    return flat
In [9]:
# Build the per-attribute frequency table that drives the ABC analysis.
#
# 1. Split each row's whitespace-separated id string and count how often every
#    id occurs over the whole training set. `.str.split()` without a separator
#    collapses runs of whitespace, so a stray double space cannot yield an
#    empty "id" that would later fail the integer label lookup.
attribute_counts = pd.Series(
    flatten(train_df["attribute_ids"].str.split().tolist())
).value_counts()

# 2. Turn the counts into a two-column frame. Naming the index and the values
#    explicitly does not depend on the name pandas assigns to the
#    value_counts() result (it is "count" on pandas >= 2.0, which made
#    `pd.DataFrame(series, columns=["Count"])` select a non-existent column).
attribute_dist = attribute_counts.rename_axis("attribute_id").reset_index(name="Count")

# 3. Attach the human-readable name. `attribute_id` stays a string (as parsed
#    from the CSV) and is cast to int only for the lookup against labels_df,
#    which is indexed by the first column of labels.csv. The first labels_df
#    column holds the name.
attribute_dist["attribute_name"] = (
    attribute_dist["attribute_id"].astype(int).map(labels_df.iloc[:, 0])
)
# Fail fast, as the original per-row `.loc` lookup did, if an id has no label.
assert attribute_dist["attribute_name"].notna().all(), "attribute id missing from labels_df"

# 4. Share of all label occurrences per attribute and its running total.
#    value_counts() sorts by descending count, so `cumsum` accumulates from the
#    most frequent attribute downwards.
attribute_dist["ratio"] = attribute_dist["Count"] / attribute_dist["Count"].sum()
attribute_dist["cumsum"] = attribute_dist["ratio"].cumsum()
attribute_dist = attribute_dist[["attribute_id", "attribute_name", "Count", "ratio", "cumsum"]]
In [10]:
# Display the distribution table (the notebook output elides the middle rows).
attribute_dist
Out[10]:
attribute_id attribute_name Count ratio cumsum
0 813 tag::men 19970 0.057614 0.057614
1 1092 tag::women 14281 0.041201 0.098815
2 147 culture::french 13522 0.039011 0.137826
3 189 culture::italian 10375 0.029932 0.167758
4 13 culture::american 9151 0.026401 0.194159
5 671 tag::flowers 8419 0.024289 0.218448
6 51 culture::british 7615 0.021969 0.240417
7 194 culture::japan 7394 0.021332 0.261749
8 1059 tag::utilitarian objects 6564 0.018937 0.280687
9 121 culture::egyptian 6542 0.018874 0.299560
10 896 tag::portraits 5955 0.017180 0.316741
11 1046 tag::trees 5591 0.016130 0.332871
12 79 culture::china 5382 0.015527 0.348398
13 780 tag::leaves 5259 0.015172 0.363570
14 156 culture::german 5163 0.014895 0.378466
15 369 culture::turkish or venice 4416 0.012740 0.391206
16 744 tag::inscriptions 3890 0.011223 0.402429
17 477 tag::birds 3692 0.010651 0.413080
18 738 tag::human figures 3665 0.010574 0.423654
19 1034 tag::textile fragments 3570 0.010300 0.433953
20 188 culture::islamic 3500 0.010098 0.444051
21 835 tag::mythical creatures 3005 0.008669 0.452720
22 903 tag::profiles 2552 0.007363 0.460083
23 420 tag::animals 2548 0.007351 0.467434
24 1099 tag::writing systems 2327 0.006713 0.474147
25 552 tag::clothing and accessories 2180 0.006289 0.480437
26 485 tag::books 2097 0.006050 0.486487
27 776 tag::landscapes 2075 0.005986 0.492473
28 161 culture::greek 2050 0.005914 0.498387
29 489 tag::bowls 2045 0.005900 0.504287
... ... ... ... ... ...
1073 372 culture::united states 3 0.000009 0.999867
1074 39 culture::beautiran 3 0.000009 0.999876
1075 329 culture::smyrna 3 0.000009 0.999885
1076 7 culture::after italian 3 0.000009 0.999893
1077 240 culture::moche-wari 2 0.000006 0.999899
1078 987 tag::slavery 2 0.000006 0.999905
1079 142 culture::for russian market 2 0.000006 0.999911
1080 904 tag::prostitutes 2 0.000006 0.999916
1081 201 culture::konigsberg 2 0.000006 0.999922
1082 187 culture::isin-larsaold babylonian 2 0.000006 0.999928
1083 71 culture::central highlands 2 0.000006 0.999934
1084 312 culture::san sabastian 2 0.000006 0.999939
1085 271 culture::northwest china/eastern central asia 2 0.000006 0.999945
1086 108 culture::devonshire 2 0.000006 0.999951
1087 389 culture::vulci 2 0.000006 0.999957
1088 281 culture::palermo 1 0.000003 0.999960
1089 293 culture::populonia 1 0.000003 0.999962
1090 805 tag::mark antony 1 0.000003 0.999965
1091 396 culture::zoroastrian 1 0.000003 0.999968
1092 146 culture::freiburg im breisgau 1 0.000003 0.999971
1093 81 culture::chinese with european decoration 1 0.000003 0.999974
1094 11 culture::algerian 1 0.000003 0.999977
1095 230 culture::mennecy or sceaux 1 0.000003 0.999980
1096 199 culture::kholmogory 1 0.000003 0.999983
1097 262 culture::nimes 1 0.000003 0.999986
1098 104 culture::dehua 1 0.000003 0.999988
1099 112 culture::dyak 1 0.000003 0.999991
1100 221 culture::macedonian 1 0.000003 0.999994
1101 328 culture::skyros 1 0.000003 0.999997
1102 366 culture::tsimshian 1 0.000003 1.000000

1103 rows × 5 columns

In [11]:
# Plot the cumulative label share against the attribute's position in the
# frequency-sorted table (row index on the x axis).
fig, ax = plt.subplots(figsize=(16, 7))
ax.plot(attribute_dist["cumsum"])
ax.set_xlabel("attribute_dist")
ax.set_ylabel("cumsum")
Out[11]:
Text(0, 0.5, 'cumsum')
In [12]:
# ABC split on the cumulative share of label occurrences:
#   A: rows up to and including a cumulative share of 0.7
#   B: rows above 0.7 and up to and including 0.9
#   C: rows above 0.9 (the long tail)
rank_A = attribute_dist.loc[lambda dist: dist["cumsum"].le(0.7)]
rank_B = attribute_dist.loc[lambda dist: dist["cumsum"].gt(0.7) & dist["cumsum"].le(0.9)]
rank_C = attribute_dist.loc[lambda dist: dist["cumsum"].gt(0.9)]
In [13]:
# Number of attributes that fall into rank A, B and C respectively.
len(rank_A), len(rank_B), len(rank_C)
Out[13]:
(86, 218, 799)
In [14]:
# Write the distribution table to attribute_distribution.csv in the working
# directory, then display it again.
# NOTE(review): to_csv is called without index=False, so presumably the row
# index is written as an extra leading column — confirm consumers expect that.
attribute_dist.to_csv("attribute_distribution.csv")
attribute_dist
Out[14]:
attribute_id attribute_name Count ratio cumsum
0 813 tag::men 19970 0.057614 0.057614
1 1092 tag::women 14281 0.041201 0.098815
2 147 culture::french 13522 0.039011 0.137826
3 189 culture::italian 10375 0.029932 0.167758
4 13 culture::american 9151 0.026401 0.194159
5 671 tag::flowers 8419 0.024289 0.218448
6 51 culture::british 7615 0.021969 0.240417
7 194 culture::japan 7394 0.021332 0.261749
8 1059 tag::utilitarian objects 6564 0.018937 0.280687
9 121 culture::egyptian 6542 0.018874 0.299560
10 896 tag::portraits 5955 0.017180 0.316741
11 1046 tag::trees 5591 0.016130 0.332871
12 79 culture::china 5382 0.015527 0.348398
13 780 tag::leaves 5259 0.015172 0.363570
14 156 culture::german 5163 0.014895 0.378466
15 369 culture::turkish or venice 4416 0.012740 0.391206
16 744 tag::inscriptions 3890 0.011223 0.402429
17 477 tag::birds 3692 0.010651 0.413080
18 738 tag::human figures 3665 0.010574 0.423654
19 1034 tag::textile fragments 3570 0.010300 0.433953
20 188 culture::islamic 3500 0.010098 0.444051
21 835 tag::mythical creatures 3005 0.008669 0.452720
22 903 tag::profiles 2552 0.007363 0.460083
23 420 tag::animals 2548 0.007351 0.467434
24 1099 tag::writing systems 2327 0.006713 0.474147
25 552 tag::clothing and accessories 2180 0.006289 0.480437
26 485 tag::books 2097 0.006050 0.486487
27 776 tag::landscapes 2075 0.005986 0.492473
28 161 culture::greek 2050 0.005914 0.498387
29 489 tag::bowls 2045 0.005900 0.504287
... ... ... ... ... ...
1073 372 culture::united states 3 0.000009 0.999867
1074 39 culture::beautiran 3 0.000009 0.999876
1075 329 culture::smyrna 3 0.000009 0.999885
1076 7 culture::after italian 3 0.000009 0.999893
1077 240 culture::moche-wari 2 0.000006 0.999899
1078 987 tag::slavery 2 0.000006 0.999905
1079 142 culture::for russian market 2 0.000006 0.999911
1080 904 tag::prostitutes 2 0.000006 0.999916
1081 201 culture::konigsberg 2 0.000006 0.999922
1082 187 culture::isin-larsaold babylonian 2 0.000006 0.999928
1083 71 culture::central highlands 2 0.000006 0.999934
1084 312 culture::san sabastian 2 0.000006 0.999939
1085 271 culture::northwest china/eastern central asia 2 0.000006 0.999945
1086 108 culture::devonshire 2 0.000006 0.999951
1087 389 culture::vulci 2 0.000006 0.999957
1088 281 culture::palermo 1 0.000003 0.999960
1089 293 culture::populonia 1 0.000003 0.999962
1090 805 tag::mark antony 1 0.000003 0.999965
1091 396 culture::zoroastrian 1 0.000003 0.999968
1092 146 culture::freiburg im breisgau 1 0.000003 0.999971
1093 81 culture::chinese with european decoration 1 0.000003 0.999974
1094 11 culture::algerian 1 0.000003 0.999977
1095 230 culture::mennecy or sceaux 1 0.000003 0.999980
1096 199 culture::kholmogory 1 0.000003 0.999983
1097 262 culture::nimes 1 0.000003 0.999986
1098 104 culture::dehua 1 0.000003 0.999988
1099 112 culture::dyak 1 0.000003 0.999991
1100 221 culture::macedonian 1 0.000003 0.999994
1101 328 culture::skyros 1 0.000003 0.999997
1102 366 culture::tsimshian 1 0.000003 1.000000

1103 rows × 5 columns

To Be Continued