EDA IMET

From: https://www.kaggle.com/saladjay/eda-imet

Author: 邓雍杰Jay

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.
['test', 'train', 'train.csv', 'labels.csv', 'sample_submission.csv']
In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import gc
import os
import PIL
import glob
import cv2
import time

from scipy import stats
from multiprocessing import Pool
from PIL import ImageOps,ImageFilter,Image
from tqdm import tqdm
from wordcloud import WordCloud

tqdm.pandas()
In [3]:
# Load the training table; the cells below add image-size columns to it.
train_size_df = pd.read_csv('../input/train.csv')
In [4]:
# Record the pixel width and height of every training image.
# Each file is opened inside a context manager so its handle is closed as soon
# as the size has been read, instead of leaving one open handle per image for
# the garbage collector.
width = []
height = []
for name in tqdm(train_size_df['id']):
    with Image.open('../input/train/'+name+'.png') as img:
        img_width, img_height = img.size
    width.append(img_width)
    height.append(img_height)
train_size_df['width'] = width
train_size_df['height'] = height
100%|██████████| 109237/109237 [01:44<00:00, 1044.63it/s]
In [5]:
# Preview the first rows, including the width/height columns added above.
train_size_df.head(5)
Out[5]:
id attribute_ids width height
0 1000483014d91860 147 616 813 339 300
1 1000fe2e667721fe 51 616 734 813 423 300
2 1001614cb89646ee 776 365 300
3 10041eb49b297c08 51 671 698 813 1092 300 358
4 100501c227f8beea 13 404 492 903 1093 300 528
In [6]:
# Per-image aspect ratio, rounded to one decimal and stored in one of two columns:
#   * width == 300  -> height/width goes into 'height_width_ratio'
#   * otherwise     -> width/height goes into 'width_height_ratio'
# The column that is not chosen keeps its initial 0 for that row.
n_images = len(train_size_df)
width_height_ratio = np.zeros(n_images)
height_width_ratio = np.zeros(n_images)
size_pairs = zip(train_size_df['width'], train_size_df['height'])
for row_index, (w, h) in tqdm(enumerate(size_pairs)):
    if w == 300:
        target, times = height_width_ratio, h/w
    else:
        target, times = width_height_ratio, w/h
    target[row_index] = round(times, 1)
train_size_df['width_height_ratio'] = width_height_ratio
train_size_df['height_width_ratio'] = height_width_ratio
109237it [00:00, 672219.21it/s]
In [7]:
# Each row holds a value in only one of the two ratio columns (the other stays 0),
# so their sum is a single combined aspect-ratio column.
train_size_df['ratio'] = train_size_df['height_width_ratio'] + train_size_df['width_height_ratio']
In [8]:
# Summary of the combined ratio as a tuple: (max, min, mean, Q1, median, Q3).
ratio_values = train_size_df['ratio']
(
    ratio_values.max(),
    ratio_values.min(),
    ratio_values.mean(),
    ratio_values.quantile(q=0.25),
    ratio_values.median(),
    ratio_values.quantile(q=0.75),
)
Out[8]:
(25.1, 1.0, 1.4746789091607393, 1.2, 1.3, 1.5)
In [9]:
# Distribution of images whose combined ratio is at most 2, one bar per ratio value.
ratio_1_2 = train_size_df[train_size_df['ratio']<=2]
print(ratio_1_2.shape)
plt.figure(figsize=(20,8))
# Pass the Series by keyword: the first positional argument of countplot is
# treated as `data` in newer seaborn releases, while `x=` works across versions.
ax = sns.countplot(x=ratio_1_2['ratio'])
plt.xlabel('ratio_1_2')
plt.title('Number of image per ratio', fontsize=20)

# Label every bar with "<count>-<share of the plotted rows>%".
n_plotted = ratio_1_2.shape[0]
for p in ax.patches:
    bar_count = p.get_height()
    ax.annotate(f'{int(bar_count)}-{bar_count * 100 / n_plotted:.3f}%',
            (p.get_x() + p.get_width() / 2., bar_count),
            ha='center',
            va='center',
            fontsize=11,
            color='black',
            xytext=(0,7),
            textcoords='offset points')
(101365, 7)
In [10]:
# Count images per ratio section. Consecutive entries of `ratio_section` bound a
# section; the upper bound is lowered by 0.1 because Series.between is inclusive
# on both ends and the ratios are rounded to one decimal.
ratio_section = [1.0,1.3,1.5,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,20.0,26.0]
section_bounds = [
    (lower, upper - 0.1)
    for lower, upper in zip(ratio_section[:-1], ratio_section[1:])
]
ratio_section_count = np.array(
    [train_size_df['ratio'].between(lower, upper).sum() for lower, upper in section_bounds]
).astype(np.int64)
w_ratio_section_count = np.array(
    [train_size_df['width_height_ratio'].between(lower, upper).sum() for lower, upper in section_bounds]
).astype(np.int64)
h_ratio_section_count = np.array(
    [train_size_df['height_width_ratio'].between(lower, upper).sum() for lower, upper in section_bounds]
).astype(np.int64)
In [11]:
def draw(start,end,column='ratio'):
    """Plot the five most frequent labels among images selected by a ratio range.

    Selects the rows of the module-level ``train_size_df`` whose ``column``
    value lies in the closed interval [start, end], counts how many of those
    rows carry each attribute id, and draws a horizontal bar chart of the five
    most frequent labels. Each bar is annotated with its count and its share of
    the selected rows. Label names are read from '../input/labels.csv'.

    Parameters
    ----------
    start, end : float
        Inclusive bounds passed to ``Series.between``.
    column : str, default 'ratio'
        Column of ``train_size_df`` to filter on.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. When no row matches, a short
        message is printed and nothing is plotted.
    """
    temp_df = train_size_df[train_size_df[column].between(start,end)]
    if temp_df.empty:
        # Nothing to count; the percentage annotations would divide by zero.
        print(f'No images with {column} between {start} and {end}')
        return

    label_names = pd.read_csv('../input/labels.csv')['attribute_name']

    # Accumulate per-label counts directly instead of building a
    # (rows x labels) indicator matrix. The set keeps the original semantics:
    # a label is counted at most once per image.
    label_sum = np.zeros(len(label_names))
    for row in temp_df['attribute_ids']:
        for label_id in {int(token) for token in row.split(' ')}:
            label_sum[label_id] += 1

    # Indices of the five largest counts, most frequent first.
    top_sequence = label_sum.argsort()[::-1][:5]
    attributes_labels = [label_names[x] for x in top_sequence]
    attributes_counts = [label_sum[x] for x in top_sequence]

    plt.figure(figsize=(20,2))
    plt.subplot()
    ax1 = sns.barplot(y=attributes_labels, x=attributes_counts, orient="h")
    plt.title(f'Label Counts between {start} and {end} (Top 5)',fontsize=15)
    # Leave 15% head-room on the right for the annotations.
    plt.xlim((0, label_sum.max()*1.15))
    plt.yticks(fontsize=15)

    n_selected = temp_df.shape[0]
    for p in ax1.patches:
        ax1.annotate(f'{int(p.get_width())}-{p.get_width() * 100 / n_selected:.2f}%',
                    (p.get_width(), p.get_y() + p.get_height() / 2.),
                    ha='left',
                    va='center',
                    fontsize=10,
                    color='black',
                    xytext=(7,0),
                    textcoords='offset points')
    plt.show()
In [12]:
# Top-5 label chart for each exact value of the combined 'ratio' column.
exact_ratios = (1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2)
for ratio in exact_ratios:
    draw(start=ratio, end=ratio)
In [13]:
# Same charts, restricted to rows that have a value in 'height_width_ratio'.
exact_ratios = (1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2)
for ratio in exact_ratios:
    draw(start=ratio, end=ratio, column='height_width_ratio')
In [14]:
# Same charts, restricted to rows that have a value in 'width_height_ratio'.
exact_ratios = (1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2)
for ratio in exact_ratios:
    draw(start=ratio, end=ratio, column='width_height_ratio')
In [15]:
# Save the table with the size and ratio columns to the working directory.
train_size_df.to_csv('train_size_df.csv')
In [16]:
# Reload the training table into a separate frame for the HSV colour analysis.
train_hsv_df = pd.read_csv('../input/train.csv')
In [17]:
# Mean hue / saturation / value of every training image, plus a gray flag.
n_train = len(train_hsv_df)
gray_img = np.zeros(n_train)
h_list = np.zeros(n_train)
s_list = np.zeros(n_train)
v_list = np.zeros(n_train)
# `total` lets tqdm draw a real progress bar; enumerate() alone has no length.
for row,img_name in tqdm(enumerate(train_hsv_df['id']), total=n_train):
    img_path = '../input/train/'+img_name+'.png'
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports an unreadable file by returning None; fail with
        # the offending path instead of an opaque cvtColor error.
        raise FileNotFoundError(f'Could not read image: {img_path}')
    hsv_img = cv2.cvtColor(img,cv2.COLOR_BGR2HSV)
    h,s,v = np.average(hsv_img,axis=(0,1))
    # A mean hue of exactly 0 is used as the gray-image flag.
    # NOTE(review): presumably a proxy for grayscale images; pixels of pure
    # red hue would also give 0 - confirm whether `s == 0` was intended.
    if h == 0:
        gray_img[row] = 1
    h_list[row] = h
    s_list[row] = s
    v_list[row] = v
train_hsv_df['gray_img'] = gray_img
train_hsv_df['h'] = h_list
train_hsv_df['s'] = s_list
train_hsv_df['v'] = v_list
94239it [13:24, 116.30it/s]
In [18]:
# Save the per-image HSV averages and gray flag to the working directory.
train_hsv_df.to_csv('train_hsv_df.csv')
In [19]:
# Keep only the rows flagged as gray (gray_img == 1) and report the subset's shape.
is_gray = train_hsv_df['gray_img'] == 1
gray_df = train_hsv_df[is_gray]
print(gray_df.shape)
(16972, 6)
In [20]:
def _multi_hot(attribute_id_strings, num_labels=1103):
    """Return a (rows x num_labels) 0/1 float matrix from space-separated id strings."""
    encoded = np.zeros((len(attribute_id_strings), num_labels))
    for row_index, row in enumerate(attribute_id_strings):
        label_ids = [int(label) for label in row.split(' ')]
        encoded[row_index, label_ids] = 1
    return encoded


# One indicator matrix for the gray subset and one for the full training table.
gray_labels = _multi_hot(gray_df['attribute_ids'])
train_labels = _multi_hot(train_hsv_df['attribute_ids'])
In [21]:
# Per-label image counts in the gray subset and in the full training set, and
# the share of each label's images that are gray.
gray_sums = np.sum(gray_labels, axis=0)
train_sums = np.sum(train_labels, axis=0)
percentage = np.true_divide(gray_sums, train_sums)

# Index the summary table by attribute name.
label_names = pd.read_csv('../input/labels.csv')['attribute_name']
new_img_df = pd.DataFrame(
    {
        'gray_count': gray_sums,
        'count': train_sums,
        'percentage': percentage,
    },
    index=label_names,
)
new_img_df.head(5)
Out[21]:
gray_count count percentage
attribute_name
culture::abruzzi 5.0 18.0 0.277778
culture::achaemenid 16.0 100.0 0.160000
culture::aegean 0.0 14.0 0.000000
culture::afghan 0.0 3.0 0.000000
culture::after british 1.0 13.0 0.076923
In [22]:
# Save the per-label gray statistics to the working directory.
new_img_df.to_csv('new_img_df.csv')
In [23]:
# The 30 labels with the highest share of gray images.
new_img_df.sort_values(by='percentage', ascending=False).head(30)
Out[23]:
gray_count count percentage
attribute_name
culture::la rochelle 3.0 3.0 1.000000
culture::british or scottish 3.0 3.0 1.000000
culture::the hague 12.0 12.0 1.000000
culture::chinese with european decoration 1.0 1.0 1.000000
culture::for danish market 6.0 6.0 1.000000
culture::konigsberg 2.0 2.0 1.000000
culture::dehua 1.0 1.0 1.000000
culture::sinceny 4.0 4.0 1.000000
culture::skyros 1.0 1.0 1.000000
culture::beautiran 3.0 3.0 1.000000
culture::for continental market 18.0 19.0 0.947368
culture::umbria 17.0 18.0 0.944444
culture::etruria 37.0 40.0 0.925000
culture::for american market 29.0 32.0 0.906250
culture::chaumont-sur-loire 21.0 24.0 0.875000
culture::greek islands 48.0 55.0 0.872727
culture::augsburg decoration 11.0 13.0 0.846154
culture::potsdam 9.0 11.0 0.818182
culture::burslem 27.0 33.0 0.818182
culture::geneva 52.0 64.0 0.812500
culture::liverpool 4.0 5.0 0.800000
culture::provincial 4.0 5.0 0.800000
culture::for british market 75.0 94.0 0.797872
culture::bordeaux 7.0 9.0 0.777778
culture::cyclades 10.0 13.0 0.769231
culture::silesia 20.0 26.0 0.769231
culture::meissen with german 13.0 17.0 0.764706
culture::frankenthal 29.0 38.0 0.763158
culture::derby 19.0 25.0 0.760000
culture::sceaux 12.0 16.0 0.750000
In [24]:
# The 30 labels with the most gray images; ties are ordered by gray share.
new_img_df.sort_values(by=['gray_count', 'percentage'], ascending=False).head(30)
Out[24]:
gray_count count percentage
attribute_name
culture::french 2792.0 13522.0 0.206478
tag::flowers 2721.0 8419.0 0.323198
culture::china 2342.0 5382.0 0.435154
tag::men 2146.0 19970.0 0.107461
tag::leaves 1639.0 5259.0 0.311656
tag::utilitarian objects 1609.0 6564.0 0.245125
tag::women 1430.0 14281.0 0.100133
culture::italian 1376.0 10375.0 0.132627
tag::textile fragments 1303.0 3570.0 0.364986
culture::german 1257.0 5163.0 0.243463
culture::american 1218.0 9151.0 0.133100
culture::japan 1157.0 7394.0 0.156478
culture::british 1145.0 7615.0 0.150361
culture::egyptian 941.0 6542.0 0.143840
tag::trees 853.0 5591.0 0.152567
tag::birds 798.0 3692.0 0.216143
tag::dishes 744.0 1789.0 0.415875
tag::inscriptions 740.0 3890.0 0.190231
culture::turkish or venice 707.0 4416.0 0.160100
tag::seals 697.0 1744.0 0.399656
tag::human figures 601.0 3665.0 0.163984
tag::profiles 577.0 2552.0 0.226097
tag::portraits 550.0 5955.0 0.092359
tag::bowls 541.0 2045.0 0.264548
tag::cups 519.0 1283.0 0.404521
tag::vases 467.0 1540.0 0.303247
tag::animals 406.0 2548.0 0.159341
culture::paris 386.0 810.0 0.476543
tag::bottles 378.0 1685.0 0.224332
culture::dutch 378.0 1762.0 0.214529
In [25]:
# Collect every label whose name ends with 'market', together with its position
# in label_names, and print the matching names.
market_hits = [
    (position, label)
    for position, label in enumerate(label_names)
    if label.endswith('market')
]
markets = [label for _, label in market_hits]
markets_index = [position for position, _ in market_hits]
for label in markets:
    print(label)
culture::for american market
culture::for british market
culture::for continental market
culture::for danish market
culture::for european market
culture::for french market
culture::for iberian market
culture::for portuguese market
culture::for russian market
culture::for swedish market
In [26]:
# Gray-image statistics restricted to the '...market' labels collected above.
new_img_df.loc[markets]
Out[26]:
gray_count count percentage
attribute_name
culture::for american market 29.0 32.0 0.906250
culture::for british market 75.0 94.0 0.797872
culture::for continental market 18.0 19.0 0.947368
culture::for danish market 6.0 6.0 1.000000
culture::for european market 44.0 93.0 0.473118
culture::for french market 6.0 14.0 0.428571
culture::for iberian market 1.0 3.0 0.333333
culture::for portuguese market 21.0 29.0 0.724138
culture::for russian market 0.0 2.0 0.000000
culture::for swedish market 6.0 10.0 0.600000
In [27]:
# Positions of the '...market' labels within label_names.
markets_index
Out[27]:
[134, 135, 136, 137, 138, 139, 140, 141, 142, 143]
In [28]:
# Show up to three distinct random images carrying label 134
# ('culture::for american market') and print each image's label names.
candidate_rows = np.where(train_labels[:,134]==1)[0]
# replace=False prevents the same image from being drawn twice; the sample size
# is capped at the number of available images.
for i in np.random.choice(candidate_rows, min(3, len(candidate_rows)), replace=False):
    sample = train_hsv_df.iloc[i]
    img_path = '../input/train/' + sample['id'] + '.png'
    # Close the file handle once the image has been handed to matplotlib.
    with Image.open(img_path) as img:
        plt.imshow(img)
    plt.show()
    print([label_names[int(label_id)] for label_id in sample['attribute_ids'].split(' ')])
['culture::for american market', 'tag::animals', 'tag::bowls', 'tag::men', 'culture::turkish or venice']
['culture::for american market', 'tag::architecture', 'tag::butterflies', 'tag::coverlets and quilts', 'tag::flowers', 'culture::turkish or venice']
['culture::for american market', 'tag::birds', 'tag::butterflies', 'tag::dishes', 'tag::flowers', 'culture::turkish or venice']
In [29]:
# Show up to three distinct random images carrying label 135
# ('culture::for british market') and print each image's label names.
candidate_rows = np.where(train_labels[:,135]==1)[0]
# replace=False prevents the same image from being drawn twice; the sample size
# is capped at the number of available images.
for i in np.random.choice(candidate_rows, min(3, len(candidate_rows)), replace=False):
    sample = train_hsv_df.iloc[i]
    img_path = '../input/train/' + sample['id'] + '.png'
    # Close the file handle once the image has been handed to matplotlib.
    with Image.open(img_path) as img:
        plt.imshow(img)
    plt.show()
    print([label_names[int(label_id)] for label_id in sample['attribute_ids'].split(' ')])
['culture::for british market', 'tag::utilitarian objects', 'culture::turkish or venice']
['culture::for british market', 'tag::bowls', 'tag::men', 'tag::musical instruments', 'tag::musicians', 'culture::turkish or venice']
['culture::for british market', 'tag::dishes', 'tag::insignia', 'culture::turkish or venice']
In [30]:
# Show up to two distinct random images carrying label 142
# ('culture::for russian market') and print each image's label names.
candidate_rows = np.where(train_labels[:,142]==1)[0]
# replace=False prevents the same image from being drawn twice; the sample size
# is capped at the number of available images.
for i in np.random.choice(candidate_rows, min(2, len(candidate_rows)), replace=False):
    sample = train_hsv_df.iloc[i]
    img_path = '../input/train/' + sample['id'] + '.png'
    # Close the file handle once the image has been handed to matplotlib.
    with Image.open(img_path) as img:
        plt.imshow(img)
    plt.show()
    print([label_names[int(label_id)] for label_id in sample['attribute_ids'].split(' ')])
['culture::french', 'culture::for russian market', 'tag::clothing and accessories', 'tag::human figures', 'tag::plants']
['culture::french', 'culture::for russian market', 'tag::clothing and accessories', 'tag::human figures', 'tag::plants']
In [31]:
def mask2(img_path):
    """Show an image beside two HSV range masks and print its mean HSV values.

    The image is read with OpenCV and converted to RGB. Two HSV versions are
    derived from that RGB array: one with ``COLOR_BGR2HSV`` and one with
    ``COLOR_RGB2HSV``. Each is thresholded with ``cv2.inRange`` and the masks
    are applied to the RGB image. Seven panels are drawn in one row: the
    original, both masks, the "chosen" result, both masked images, and the
    per-pixel average of the two masked images. Afterwards the channel means
    of both HSV versions are printed.

    Parameters
    ----------
    img_path : str
        Path of the image file to read.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read ``img_path``.
    """
    image = cv2.imread(img_path)
    if image is None:
        # cv2.imread returns None for an unreadable file instead of raising.
        raise FileNotFoundError(f'Could not read image: {img_path}')
    img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # NOTE(review): `img` is already RGB, so COLOR_BGR2HSV reads its channels in
    # swapped order. This is kept because the panels compare both conversions.
    hsv_image = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    hsv_image2 = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
    h,s,v = cv2.split(hsv_image)

    lower_blue = np.array([20,30,0])
    upper_blue = np.array([160,255,255])
    lower_blue2 = np.array([0,30,0])
    upper_blue2 = np.array([255,255,255])
    mask = cv2.inRange(hsv_image,lower_blue,upper_blue)
    # Named mask_rgb so the local no longer shadows this function's own name.
    mask_rgb = cv2.inRange(hsv_image2,lower_blue2,upper_blue2)

    res2 = cv2.bitwise_and(img,img,mask=mask)
    res3 = cv2.bitwise_and(img,img,mask=mask_rgb)
    # Average in uint16: adding two uint8 arrays wraps modulo 256, which made
    # the old result wrong wherever the sum exceeded 255.
    res4 = ((res2.astype(np.uint16) + res3) // 2).astype(np.uint8)

    # The first mask is used as the "chosen" result only when the mean hue,
    # saturation and value all reach their thresholds.
    use_mask = not (np.average(h)<20 or np.average(s)<30 or np.average(v)<140)
    res = res2 if use_mask else img

    panels = [
        (img, 'ORIGINAL'),
        (mask, 'Mask1'),
        (mask_rgb, 'Mask2'),
        (res, 'use_mask' if use_mask else 'original'),
        (res2, 'use_mask_BGR'),
        (res3, 'use_mask_RGB'),
        (res4, 'res4'),
    ]
    for position, (panel, title) in enumerate(panels, start=1):
        plt.subplot(1, 7, position)
        plt.imshow(panel)
        plt.title(title)
    plt.show()

    mean_h, mean_s, mean_v = np.average(hsv_image,axis=(0,1))
    print(mean_h, mean_s, mean_v)
    mean_h2, mean_s2, mean_v2 = np.average(hsv_image2,axis=(0,1))
    print(mean_h2, mean_s2, mean_v2)
    if mean_h == 0 and mean_s == 0:
        # Zero mean hue and saturation: also print the mean RGB values.
        print(np.average(img,axis=(0,1)))
In [32]:
# Run mask2 on three randomly drawn training images, each in its own large figure.
for row_position in np.random.choice(len(train_hsv_df), 3):
    plt.figure(figsize=(20,20))
    sampled_id = train_hsv_df.iloc[row_position]['id']
    mask2('../input/train/' + sampled_id + '.png')
41.493525 30.394083333333334 130.65836666666667
19.304175 30.394083333333334 130.65836666666667
101.27489560835133 53.808711303095755 191.0398128149748
18.73585313174946 53.808711303095755 191.0398128149748
94.54458020050126 55.49302631578947 188.8110902255639
25.5037343358396 55.49302631578947 188.8110902255639