angtk_0190409_test1

From: https://www.kaggle.com/deeplearningzy/angtk-0190409-test1

Author: Zhangyao

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show the entries found directly under the input directory.
input_entries = os.listdir("../input")
print(input_entries)

# Any results you write to the current directory are saved as output.
['test', 'train', 'train.csv', 'labels.csv', 'sample_submission.csv']
In [2]:
# Load the three CSV tables shipped with the competition data.
train_img = pd.read_csv('../input/train.csv')
labels_img = pd.read_csv('../input/labels.csv')
submission = pd.read_csv('../input/sample_submission.csv')

# Report the dimensions of each table. The sample submission is what gets
# reported under the "Test data" label here.
print("Train data shape - rows:",train_img.shape[0],"columns:",train_img.shape[1])
print("labels data shape - rows:",labels_img.shape[0],"columns:",labels_img.shape[1])
# Label previously read "roes"; corrected to "rows" to match the lines above.
print("Test data shape - rows:",submission.shape[0],"columns:",submission.shape[1])
Train data shape - rows: 109237 columns: 2
labels data shape - rows: 1103 columns: 2
Test data shape - roes: 7443 columns: 2
In [3]:
# Count how often each distinct attribute_ids value occurs in the training
# table, keep the five most common, and move the values out of the index
# into an ordinary column.
th5 = (
    train_img.attribute_ids
    .value_counts()
    .head(5)
    .reset_index()
)
# NOTE(review): the first column is labelled 'landmark_id' even though it holds
# attribute_ids values; the plotting cells refer to it by this name.
th5.columns = ['landmark_id', 'count']
th5
Out[3]:
landmark_id count
0 13 405 896 1092 1158
1 813 896 586
2 194 1034 489
3 13 552 482
4 121 1059 465
In [4]:
# Same tally as for the most common values, but keeping the last five rows of
# the value counts, i.e. the least common attribute_ids values.
tb5 = (
    train_img.attribute_ids
    .value_counts()
    .tail(5)
    .reset_index()
)
# NOTE(review): the first column is labelled 'landmark_id' even though it holds
# attribute_ids values; the plotting cells refer to it by this name.
tb5.columns = ['landmark_id', 'count']
tb5
Out[4]:
landmark_id count
0 43 51 483 650 668 813 1045 1
1 121 532 724 1
2 189 378 597 1034 1
3 189 418 795 832 834 879 1
4 194 813 1020 1057 1
In [5]:
# Bar chart of the five MOST frequent values held in th5 (one bar per row,
# bar height = count).
# NOTE(review): set_color_codes changes how matplotlib's single-letter colour
# shorthands are interpreted; the full name "blue" used below is presumably
# unaffected by it — confirm whether "b" was intended.
sns.set_color_codes("pastel")
fig, ax = plt.subplots(figsize=(6, 10))
ax.set_title('most frequent landmarks')
sns.barplot(
    data=th5,
    x="landmark_id",
    y="count",
    color="blue",
    label="Count",
    ax=ax,
)
plt.show()
In [6]:
# Bar chart of the five least frequent values held in tb5 (one bar per row,
# bar height = count).
# NOTE(review): set_color_codes changes how matplotlib's single-letter colour
# shorthands are interpreted; the named colour "orange" used below is
# presumably unaffected by it — confirm.
sns.set_color_codes("pastel")
fig, ax = plt.subplots(figsize=(6, 10))
ax.set_title('Least frequent landmarks')
sns.barplot(
    data=tb5,
    x="landmark_id",
    y="count",
    color="orange",
    label="Count",
    ax=ax,
)
plt.show()