Fast and simple EDA (sample_sub 0.00053)

From: https://www.kaggle.com/bejeweled/fast-and-simple-eda-sample-sub-0-00053

Author: Dmitriy Ershov

Score: 0.00053

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show what the competition ships in the input directory (files and sub-folders).
input_entries = os.listdir("../input")
print(input_entries)

# Any results you write to the current directory are saved as output.
['test', 'labels.csv', 'sample_submission.csv', 'train.csv', 'train']
In [2]:
import cv2

import matplotlib.pyplot as plt
import seaborn as sns
# Global plot styling for every figure in this notebook.
# matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*" and
# later releases removed the old names, so plain 'seaborn' fails there.
# Use the legacy name when it is still shipped, otherwise the renamed equivalent.
seaborn_style = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
plt.style.use(seaborn_style)
sns.set(font_scale=1)
In [3]:
# Load the three CSV tables listed in the input directory:
#   df_train   - training ids with their space-separated attribute ids
#   labels     - attribute id -> attribute name lookup
#   sample_sub - submission template
df_train, labels, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/train.csv",
        "../input/labels.csv",
        "../input/sample_submission.csv",
    )
)
In [4]:
# Peek at the first five training rows.
df_train.head(n=5)
Out[4]:
id attribute_ids
0 1000483014d91860 147 616 813
1 1000fe2e667721fe 51 616 734 813
2 1001614cb89646ee 776
3 10041eb49b297c08 51 671 698 813 1092
4 100501c227f8beea 13 404 492 903 1093
In [5]:
# Peek at the first five rows of the attribute lookup table.
labels.head(n=5)
Out[5]:
attribute_id attribute_name
0 0 culture::abruzzi
1 1 culture::achaemenid
2 2 culture::aegean
3 3 culture::afghan
4 4 culture::after british
In [6]:
# Peek at the first five rows of the submission template.
sample_sub.head(n=5)
Out[6]:
id attribute_ids
0 10023b2cc4ed5f68 0 1 2
1 100fbe75ed8fd887 0 1 2
2 101b627524a04f19 0 1 2
3 10234480c41284c6 0 1 2
4 1023b0e2636dcea8 0 1 2
In [7]:
# (rows, columns) of each table, in the order: train, labels, sample submission.
tuple(table.shape for table in (df_train, labels, sample_sub))
Out[7]:
((109237, 2), (1103, 2), (7443, 2))
In [8]:
# Ten most frequent attribute-id combinations, most common first.
# Each distinct `attribute_ids` string is counted as one combination.
combo_counts = df_train.groupby("attribute_ids").size()
combo_counts.sort_values().tail(10).iloc[::-1]
Out[8]:
attribute_ids
13 405 896 1092    1158
813 896             586
194 1034            489
13 552              482
121 1059            465
121 433             425
13 626              365
79 1059             352
13 813 896          339
121 1039            332
dtype: int64
In [9]:
# Histogram (10 bins) of how often each of the ten most frequent
# attribute-id combinations occurs.
top10_counts = df_train.groupby("attribute_ids").size().sort_values().tail(10).iloc[::-1]
top10_counts.hist(bins=10)
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f1aae44c438>
In [10]:
# Count the entries in each image folder.
train_entries = os.listdir("../input/train/")
print("train size is {}".format(len(train_entries)))
test_entries = os.listdir("../input/test/")
print("test size is {}".format(len(test_entries)))
train size is 109237
test size is 7443
In [11]:
# Preview 16 training images in a 4x4 grid.
train_dir = "../input/train/"
# os.listdir returns names in arbitrary order; sort so the same 16 images
# are shown on every run.
train_names = sorted(os.listdir(train_dir))[:16]
plt.figure(figsize=[16,16])
for c, img_name in enumerate(train_names, start=1):
    plt.subplot(4,4,c)
    plt.title("train image {}".format(c))
    img = cv2.imread(os.path.join(train_dir, img_name))
    if img is None:
        # cv2.imread returns None (it does not raise) when a file cannot be
        # decoded; leave the panel empty instead of failing on the indexing below.
        continue
    # OpenCV decodes to BGR channel order; reverse the last axis to get RGB for matplotlib.
    plt.imshow(img[..., ::-1])
plt.show()
In [12]:
# Preview 16 test images in a 4x4 grid.
test_dir = "../input/test/"
# os.listdir returns names in arbitrary order; sort so the same 16 images
# are shown on every run.
test_names = sorted(os.listdir(test_dir))[:16]
plt.figure(figsize=[16,16])
for c, img_name in enumerate(test_names, start=1):
    plt.subplot(4,4,c)
    plt.title("test image {}".format(c))
    img = cv2.imread(os.path.join(test_dir, img_name))
    if img is None:
        # cv2.imread returns None (it does not raise) when a file cannot be
        # decoded; leave the panel empty instead of failing on the indexing below.
        continue
    # OpenCV decodes to BGR channel order; reverse the last axis to get RGB for matplotlib.
    plt.imshow(img[..., ::-1])
plt.show()

Finally, let's submit the unmodified sample_submission as a baseline and find out what score it gets.

In [13]:
# Write the unmodified submission template as the submission file.
# index=False keeps the DataFrame's row index out of the CSV.
sample_sub.to_csv(path_or_buf="submission.csv", index=False)