From: https://www.kaggle.com/jshen97/label-exploration-and-clustering
Author: Jason
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
# Show which files are present in the word2vec input directory.
print(os.listdir("../input/googlenewsvectorsnegative300"))
# Any results you write to the current directory are saved as output.
['GoogleNews-vectors-negative300.bin.gz']
# Raise pandas' display limits so longer/wider frames are printed in full.
pd.set_option('display.max_rows', 64)
pd.set_option('display.max_columns', 512)

# Label table: the part of attribute_name after the '::' separator is used as
# the label text (presumably "<category>::<label>" -- confirm against the CSV).
label = pd.read_csv('../input/imet-2019-fgvc6/labels.csv')
label_list = [name.split('::')[1] for name in label.attribute_name]
# Label text -> attribute id.
label_dict = dict(zip(label_list, label.attribute_id))

# Training annotations (the attribute_ids column is read further down).
data = pd.read_csv('../input/imet-2019-fgvc6/train.csv')
from gensim import models
/opt/conda/lib/python3.6/site-packages/smart_open/ssh.py:34: UserWarning: paramiko missing, opening SSH/SCP/SFTP paths will be disabled. `pip install paramiko` to suppress warnings.warn('paramiko missing, opening SSH/SCP/SFTP paths will be disabled. `pip install paramiko` to suppress')
# Load the pre-trained GoogleNews word2vec vectors (binary word2vec format).
w2v_path = '../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin.gz'
w2v = models.KeyedVectors.load_word2vec_format(w2v_path, binary=True)
# Look up a word2vec embedding for every label.
#
# For each label the following spellings are tried in order, and the first
# one present in the vocabulary wins:
#   1. the label as-is,
#   2. the label with its first character upper-cased,
#   3. the label with spaces replaced by underscores.
# Labels for which none of the spellings is found are collected in
# `exceptions`.
#
# Only KeyError (a missing vocabulary entry) is treated as "not found"; the
# previous `except BaseException` also swallowed KeyboardInterrupt/SystemExit
# and would have hidden genuine errors.
map_label = {}
exceptions = []
for item in label_list:
    candidates = (
        item,
        # item[:1] instead of item[0] so an empty label cannot raise IndexError.
        item[:1].upper() + item[1:],
        item.replace(' ', '_'),
    )
    for candidate in candidates:
        try:
            map_label[item] = w2v[candidate]
        except KeyError:
            continue
        break
    else:
        # No spelling matched.
        exceptions.append(item)
print(len(exceptions), len(map_label))
231 872
# Notebook display: the labels for which no embedding lookup succeeded.
exceptions
['after british', 'after german', 'after german original', 'after italian', 'after russian original', 'alexandria-hadra', 'american or european', 'asia minor', 'atlantic watershed', 'augsburg decoration', 'augsburg original', 'babylonian or kassite', 'bactria-margiana archaeological complex', 'beautiran', 'boeotian', 'british or french', 'british or scottish', 'canosan', 'castel durante', 'central asia', 'central european', 'central italian', 'chalcidian', 'chaumont-sur-loire', 'chelsea-derby', 'chinese with dutch decoration', 'chinese with european decoration', 'chinese with french mounts', 'chorrera', 'chupicuaro', 'colonial american', 'coromandel coast', 'costa rica or panama', 'cypriot or phoenician', 'east greek', 'east greek/sardis', 'eastern european', 'eastern mediterranean', 'eastern mediterranean or italian', 'edomite', 'euboean', 'european bronze age', 'faliscan', 'flemish or italian', 'for american market', 'for british market', 'for continental market', 'for danish market', 'for european market', 'for french market', 'for iberian market', 'for portuguese market', 'for russian market', 'for swedish market', 'freiburg im breisgau', 'french or german', 'french or italian', 'french or swiss', 'german or swiss', 'ghassulian', 'gnathian', 'greek islands', 'greek or roman', 'guanacaste-nicoya', 'helladic', 'hochst', 'huastec', 'indian or nepalese', 'isin-larsa', 'isin-larsaold babylonian', 'italian or sicilian', 'italian or spanish', 'italic-native', 'jouy-en-josas', 'kathmandu valley', 'kholmogory', "kievan rus'", 'la rochelle', 'laconian', 'langobardic', 'london original', 'longton hall', 'macaracas', 'meissen with german', 'mennecy', 'mennecy or sceaux', 'mezcala', 'moche-wari', 'montelupo', 'moustiers', 'muisca', 'neo-sumerian', 'neuwied am rhein', 'north china', 'north indian', 'north italian', 'north netherlandish', 'northern european', 'northern india', 'northern italian', 'northwest china', 'northwest china/eastern central asia', 'old assyrian trading 
colony', 'ottonian', 'parthian or sasanian', 'populonia', 'praenestine', 'proto-elamite', 'remojadas', 'roman egyptian', 'saint-cloud', 'salinar', 'san sabastian', 'sinceny', 'south german', 'south italian', 'south netherlandish', 'southern german', 'st. petersburg', 'stoke-on-trent', 'tairona', 'tarentine', 'teano', 'the hague', 'tlatilco', 'tolita-tumaco', 'topara', 'turkish or venice', 'united states', 'urartian', 'urbino with gubbio luster', 'vulci', 'west slavic', 'western european', 'zenu', 'adoration of the magi', 'adoration of the sheperds', 'air transports', 'alexander the great', 'ancient greek', 'archangel gabriel', 'architectural elements', 'architectural fragments', 'assumption of the virgin', 'baptism of christ', 'benjamin franklin', 'bodies of water', 'body parts', 'bow and arrow', 'buddhist religious figures', 'buildings and structures', 'carpets and rugs', 'christian imagery', 'civil war', 'clothing and accessories', 'coat of arms', 'coverlets and quilts', 'daily life', 'decorative designs', 'decorative elements', 'descent from the cross', 'design elements', 'drinking glasses', 'egg and dart', 'emperor augustus', 'gadrooning', 'genre scene', 'gingham pattern', 'greek deities', 'greek figures', 'hindu religious figures', 'historical figures', 'holy family', 'horse riding', 'human figures', 'julius caesar', 'last judgement', 'last supper', 'liturgical objects', 'living rooms', 'louis xiv', 'madonna and child', 'mark antony', 'mary magdalene', 'military clothing', 'military equipment', 'napoleon i', 'new testament', 'nonrepresentational art', 'old testament', 'palmettes', 'playing cards', 'pocket watches', 'polka-dot pattern', 'religious events', 'religious texts', 'roman deities', 'saint anne', 'saint anthony', 'saint catherine', 'saint francis', 'saint george', 'saint jerome', 'saint john the baptist', 'saint john the evangelist', 'saint joseph', 'saint lawrence', 'saint mark', 'saint matthew', 'saint michael', 'saint paul', 'saint peter', 'seating 
furniture', 'self-portraits', 'still life', 'storage furniture', 'strapwork', 'street scene', 'sword guards', 'taweret', 'tea caddy', 'tea drinking', 'textile fragments', 'tools and equipment', 'tricorns', 'vajrapani', 'vase fragments', 'weights and measures', 'world war i', 'writing implements', 'writing systems']
# Scratch cell: displays a length-2 zero array; the result is not stored.
np.zeros(2)
array([0., 0.])
# Count how often each label without an embedding occurs in the training
# annotations.
#
# no_embeds[i] is the attribute id of exceptions[i]; no_embed[i] is the number
# of times that id appears across all rows of data.attribute_ids.

# Attribute ids of the labels that have no embedding, in `exceptions` order.
no_embeds = [label_dict[item] for item in exceptions]

# Tally every attribute id in a single pass instead of rescanning the whole
# `no_embeds` list for each id (which was O(total ids * len(exceptions))).
occurrences = {}
for item in data.attribute_ids:
    for num in item.split(' '):
        num = int(num)
        occurrences[num] = occurrences.get(num, 0) + 1

# Kept as a float array, exactly like the original np.zeros accumulator.
no_embed = np.array(
    [occurrences.get(attr_id, 0) for attr_id in no_embeds],
    dtype=np.float64,
)
no_embed.astype(np.int32)
array([ 13, 17, 4, 3, 14, 10, 408, 22, 41, 13, 12, 4, 119, 3, 35, 17, 3, 13, 13, 43, 831, 12, 7, 24, 14, 5, 1, 6, 8, 16, 19, 4, 3, 3, 60, 58, 5, 127, 17, 7, 6, 7, 4, 4, 32, 94, 19, 6, 93, 14, 3, 29, 2, 10, 1, 12, 37, 22, 14, 36, 7, 55, 53, 10, 21, 52, 8, 6, 6, 2, 3, 69, 9, 44, 7, 1, 9, 3, 116, 5, 13, 4, 10, 17, 21, 1, 71, 2, 7, 12, 12, 48, 4, 43, 6, 30, 17, 14, 5, 56, 28, 2, 71, 4, 6, 1, 9, 5, 10, 9, 40, 10, 2, 4, 13, 293, 219, 81, 63, 22, 21, 60, 13, 12, 19, 7, 5, 4416, 3, 22, 3, 2, 6, 16, 8, 91, 71, 36, 58, 21, 35, 254, 888, 34, 24, 26, 1472, 40, 253, 28, 102, 205, 866, 56, 2180, 1390, 39, 75, 78, 1428, 21, 440, 186, 39, 26, 38, 22, 12, 55, 257, 3, 36, 160, 1895, 3665, 34, 34, 46, 234, 4, 36, 637, 1, 111, 40, 233, 27, 7, 37, 9, 215, 141, 204, 7, 12, 10, 37, 51, 36, 81, 54, 41, 96, 329, 87, 79, 20, 18, 39, 38, 113, 119, 67, 97, 67, 55, 68, 23, 303, 42, 64, 20, 3570, 2001, 13, 5, 744, 228, 12, 487, 2327], dtype=int32)
# Total number of attribute ids over all training rows. Each token is still
# passed through int() so a malformed id raises, as in the explicit loop.
count = len([
    int(token)
    for row_ids in data.attribute_ids
    for token in row_ids.split(' ')
])
count
346618
# One row per label that has an embedding, one column per vector component.
embedding_columns = pd.DataFrame(map_label)  # labels as columns
df_labels = embedding_columns.transpose()    # labels as rows
df_labels.head(5)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
abruzzi | 0.079590 | -0.049316 | -0.026733 | 0.181641 | 0.253906 | 0.291016 | -0.069824 | -0.145508 | 0.058105 | -0.148438 | 0.259766 | 0.083984 | 0.335938 | -0.046387 | 0.151367 | 0.414062 | -0.189453 | 0.127930 | 0.289062 | 0.026367 | -0.066895 | 0.273438 | 0.263672 | 0.083008 | 0.019287 | -0.103516 | -0.177734 | 0.193359 | -0.163086 | -0.212891 | 0.363281 | -0.120605 | 0.215820 | 0.324219 | 0.087891 | -0.371094 | 0.359375 | -0.055420 | -0.226562 | -0.170898 | 0.355469 | 0.159180 | -0.045166 | 0.007019 | 0.002640 | 0.012390 | 0.064941 | 0.333984 | 0.283203 | -0.091309 | -0.246094 | 0.040039 | -0.023682 | -0.223633 | 0.171875 | -0.031738 | -0.253906 | -0.091797 | -0.086914 | 0.136719 | -0.242188 | 0.084961 | 0.250000 | -0.162109 | 0.124512 | -0.271484 | -0.036133 | -0.488281 | 0.094727 | 0.082520 | 0.046143 | -0.191406 | -0.204102 | 0.011292 | -0.056152 | -0.213867 | -0.040039 | -0.257812 | 0.005676 | -0.496094 | 0.178711 | 0.056641 | -0.203125 | -0.047119 | -0.016602 | -0.084961 | -0.197266 | 0.127930 | 0.187500 | -0.032959 | -0.246094 | -0.488281 | -0.059570 | 0.162109 | -0.173828 | 0.041992 | -0.088867 | -0.316406 | -0.238281 | -0.289062 | -0.691406 | -0.341797 | 0.144531 | -0.112305 | 0.155273 | 0.240234 | -0.593750 | -0.152344 | -0.136719 | 0.248047 | -0.115723 | 0.091309 | 0.671875 | -0.316406 | -0.112793 | 0.031738 | 0.220703 | -0.047852 | 0.232422 | 0.458984 | -0.104980 | 0.019165 | -0.186523 | -0.151367 | 0.384766 | -0.289062 | -0.402344 | -0.019287 | -0.287109 | 0.523438 | -0.566406 | -0.050293 | 0.117676 | -0.178711 | -0.151367 | 0.075684 | -0.125000 | -0.070312 | 0.172852 | 0.243164 | 0.617188 | 0.020020 | -0.198242 | 0.261719 | 0.326172 | 0.025024 | 0.109375 | 0.222656 | -0.158203 | -0.570312 | 0.070801 | -0.110840 | -0.124023 | 0.235352 | -0.066895 | 0.199219 | 0.188477 | 0.660156 | -0.162109 | 0.056396 | -0.281250 | -0.222656 | -0.277344 | 0.010315 | 0.080566 | -0.178711 | 0.253906 | 0.355469 | -0.064453 | -0.066895 | -0.369141 | -0.107422 | 
-0.013000 | -0.092773 | 0.031494 | 0.099121 | 0.015015 | 0.203125 | 0.243164 | -0.289062 | -0.190430 | -0.041016 | -0.090332 | 0.302734 | 0.310547 | -0.058594 | -0.062012 | 0.035889 | -0.049805 | -0.275391 | 0.124023 | -0.032471 | 0.216797 | 0.077148 | 0.112793 | 0.164062 | 0.124023 | -0.096680 | 0.134766 | -0.132812 | 0.289062 | 0.353516 | -0.304688 | 0.126953 | 0.063965 | -0.306641 | 0.206055 | 0.008301 | 0.014038 | 0.101562 | -0.039307 | -0.308594 | -0.036133 | 0.034424 | -0.365234 | 0.078125 | 0.246094 | 0.016602 | -0.190430 | 0.125977 | 0.055420 | -0.259766 | -0.063477 | -0.159180 | 0.001854 | 0.152344 | -0.068359 | 0.373047 | -0.316406 | 0.044434 | -0.101562 | -0.089355 | 0.125977 | 0.043213 | -0.019165 | 0.020386 | -0.213867 | 0.153320 | 0.197266 | -0.032959 | -0.026489 | -0.206055 | 0.044922 | 0.287109 | -0.112305 | -0.166016 | 0.361328 | 0.088379 | 0.117676 | 0.253906 | -0.155273 | 0.085449 | -0.125000 | -0.169922 | 0.589844 | -0.045166 | -0.157227 | -0.058105 | -0.140625 | -0.141602 | -0.130859 | 0.097656 | 0.218750 | 0.373047 | -0.242188 | -0.122070 | -0.209961 | -0.063477 | 0.163086 | -0.275391 | 0.007568 | -0.060547 | -0.230469 | 0.414062 | 0.425781 | 0.033691 | 0.022583 | -0.341797 | -0.172852 | 0.117188 | 0.244141 | 0.137695 | 0.172852 | -0.174805 | -0.018433 | -0.166992 | -0.458984 | 0.047852 | -0.406250 | -0.566406 | -0.048828 | 0.007751 | -0.421875 | 0.390625 | 0.231445 | 0.011047 | -0.104492 | 0.161133 | 0.130859 | 0.291016 |
achaemenid | 0.330078 | 0.523438 | -0.248047 | 0.281250 | 0.519531 | 0.291016 | 0.192383 | -0.484375 | -0.015747 | 0.261719 | -0.239258 | -0.339844 | -0.363281 | 0.402344 | -0.330078 | -0.292969 | -0.511719 | -0.359375 | -0.178711 | 0.075684 | -0.267578 | -0.382812 | -0.106445 | 0.139648 | 0.063477 | 0.064453 | -0.244141 | 0.024414 | -0.386719 | -0.205078 | 0.036865 | -0.077148 | 0.107422 | 0.185547 | -0.012268 | -0.351562 | -0.067383 | -0.294922 | -0.032227 | -0.189453 | 0.237305 | 0.265625 | 0.253906 | -0.156250 | 0.396484 | -0.367188 | 0.028320 | 0.133789 | -0.250000 | -0.169922 | 0.098145 | -0.028198 | -0.181641 | -0.091797 | -0.292969 | 0.402344 | -0.129883 | 0.294922 | -0.147461 | -0.437500 | -0.028564 | -0.021606 | -0.171875 | 0.046875 | -0.322266 | -0.324219 | 0.257812 | -0.142578 | 0.077148 | -0.236328 | -0.200195 | 0.103027 | -0.503906 | -0.084473 | -0.153320 | -0.113770 | 0.155273 | 0.128906 | 0.156250 | -0.050049 | -0.008911 | -0.287109 | 0.001022 | -0.228516 | 0.032227 | 0.167969 | -0.051025 | -0.199219 | -0.162109 | 0.326172 | -0.201172 | -0.687500 | -0.227539 | 0.086426 | 0.406250 | -0.085449 | -0.083496 | 0.296875 | 0.049561 | -0.016968 | -0.074219 | -0.130859 | 0.060059 | 0.017578 | -0.049316 | -0.012390 | -0.406250 | 0.267578 | 0.261719 | 0.106445 | 0.063965 | 0.226562 | 0.140625 | -0.105469 | 0.209961 | -0.367188 | -0.137695 | -0.031128 | 0.058105 | 0.066406 | 0.213867 | -0.123535 | 0.110352 | 0.253906 | 0.435547 | 0.128906 | 0.247070 | -0.324219 | -0.570312 | 0.221680 | -0.138672 | 0.130859 | -0.480469 | 0.310547 | -0.190430 | -0.236328 | -0.326172 | -0.131836 | -0.032227 | 0.464844 | 0.140625 | 0.105469 | -0.081055 | 0.147461 | 0.357422 | -0.283203 | -0.058105 | 0.053711 | -0.322266 | -0.235352 | 0.191406 | -0.441406 | -0.224609 | 0.033203 | 0.145508 | -0.033691 | -0.341797 | 0.019653 | 0.289062 | 0.053467 | -0.125977 | 0.167969 | -0.228516 | -0.421875 | 0.181641 | -0.096191 | 0.269531 | 0.324219 | -0.120117 | 0.378906 | -0.191406 | -0.188477 | 
0.043457 | -0.263672 | 0.539062 | -0.373047 | 0.090332 | -0.265625 | 0.133789 | -0.353516 | -0.103027 | 0.054688 | -0.240234 | -0.169922 | -0.275391 | 0.180664 | 0.345703 | 0.015747 | 0.043213 | 0.153320 | -0.163086 | 0.291016 | 0.193359 | 0.105957 | 0.026855 | -0.175781 | 0.617188 | 0.025513 | 0.103027 | -0.109375 | 0.130859 | 0.031494 | -0.298828 | -0.224609 | 0.031250 | 0.308594 | 0.192383 | -0.019409 | 0.052734 | 0.273438 | -0.094727 | -0.168945 | -0.180664 | -0.453125 | 0.007233 | 0.194336 | -0.093750 | -0.170898 | 0.318359 | -0.369141 | -0.032471 | -0.008240 | -0.365234 | 0.013855 | -0.022583 | -0.050293 | -0.267578 | 0.202148 | -0.155273 | 0.283203 | -0.014343 | 0.078613 | -0.384766 | 0.159180 | -0.011353 | -0.174805 | -0.066406 | 0.193359 | 0.367188 | 0.102051 | 0.394531 | -0.096191 | 0.375000 | -0.482422 | -0.136719 | 0.182617 | -0.519531 | 0.209961 | -0.263672 | -0.253906 | 0.120605 | -0.225586 | -0.200195 | 0.183594 | 0.084961 | -0.059570 | -0.507812 | -0.341797 | 0.035400 | -0.149414 | 0.410156 | -0.023804 | -0.073242 | -0.210938 | -0.380859 | 0.371094 | 0.141602 | -0.582031 | 0.120605 | -0.273438 | -0.367188 | 0.109863 | 0.154297 | 0.096680 | 0.750000 | -0.037109 | 0.347656 | -0.285156 | -0.116699 | 0.015564 | 0.062988 | 0.431641 | 0.283203 | -0.003464 | 0.259766 | -0.071289 | -0.558594 | 0.283203 | 0.062988 | 0.011169 | -0.478516 | -0.208008 | 0.093750 | 0.046387 | -0.006897 | 0.099609 | -0.435547 | 0.125977 | 0.347656 | -0.054199 |
aegean | 0.285156 | -0.046631 | -0.040771 | -0.057129 | 0.163086 | -0.178711 | 0.044922 | -0.112305 | -0.128906 | 0.285156 | -0.210938 | -0.062256 | 0.054443 | -0.095703 | -0.052734 | 0.100098 | -0.562500 | -0.216797 | 0.306641 | -0.061523 | 0.102051 | -0.345703 | -0.032715 | -0.188477 | 0.114258 | 0.443359 | -0.026367 | 0.367188 | -0.427734 | 0.068848 | -0.026855 | -0.271484 | -0.074219 | 0.215820 | 0.103027 | 0.072266 | -0.203125 | 0.030029 | 0.441406 | -0.285156 | 0.044434 | 0.069824 | -0.128906 | -0.150391 | -0.106445 | -0.066895 | -0.128906 | -0.367188 | -0.015747 | -0.145508 | 0.070801 | 0.221680 | 0.124512 | -0.097656 | -0.343750 | 0.171875 | -0.225586 | 0.257812 | -0.114258 | -0.289062 | -0.223633 | -0.078125 | 0.151367 | 0.213867 | 0.207031 | 0.048340 | 0.273438 | -0.125000 | -0.171875 | -0.074707 | -0.283203 | -0.096191 | 0.033691 | -0.007172 | -0.143555 | -0.494141 | 0.109863 | -0.433594 | -0.018066 | 0.052246 | -0.097168 | -0.217773 | -0.617188 | -0.217773 | 0.062988 | 0.168945 | -0.271484 | 0.089355 | 0.010010 | 0.324219 | -0.038574 | -0.562500 | -0.394531 | 0.138672 | -0.390625 | 0.049561 | -0.050537 | -0.003601 | 0.320312 | 0.012695 | -0.193359 | -0.265625 | 0.104492 | -0.085449 | -0.064453 | 0.033691 | 0.044678 | -0.265625 | -0.084473 | 0.109863 | -0.179688 | 0.137695 | 0.343750 | -0.112305 | -0.123047 | -0.289062 | -0.096680 | 0.072754 | 0.126953 | 0.102539 | -0.130859 | 0.115234 | 0.139648 | -0.054443 | -0.134766 | -0.188477 | -0.228516 | 0.089844 | -0.515625 | 0.312500 | -0.243164 | 0.114258 | -0.279297 | 0.535156 | -0.257812 | -0.333984 | -0.189453 | -0.197266 | 0.277344 | -0.123047 | 0.196289 | -0.251953 | -0.166992 | 0.189453 | 0.016479 | -0.055664 | -0.312500 | 0.221680 | -0.054443 | 0.026978 | 0.008362 | 0.139648 | 0.000012 | -0.113770 | 0.135742 | -0.041016 | 0.067871 | -0.117188 | -0.028687 | 0.103516 | -0.082520 | -0.003265 | 0.039551 | -0.126953 | 0.229492 | 0.316406 | -0.215820 | 0.070312 | -0.149414 | 0.355469 | -0.300781 | -0.045166 | 
0.250000 | -0.200195 | -0.056152 | -0.194336 | -0.081543 | -0.263672 | -0.141602 | -0.412109 | 0.001053 | 0.121094 | -0.066406 | 0.167969 | -0.156250 | -0.089844 | 0.075195 | -0.257812 | 0.229492 | -0.055908 | 0.027588 | -0.015991 | 0.025879 | 0.460938 | -0.030273 | 0.410156 | 0.054688 | 0.150391 | 0.216797 | 0.078613 | 0.037842 | 0.168945 | -0.308594 | -0.357422 | 0.020264 | -0.035645 | 0.333984 | -0.275391 | 0.007263 | 0.149414 | -0.261719 | 0.042725 | -0.164062 | 0.527344 | -0.096680 | 0.069824 | 0.215820 | 0.014832 | 0.038574 | -0.157227 | 0.049072 | 0.092285 | -0.113770 | 0.154297 | -0.131836 | -0.131836 | -0.113770 | 0.055664 | 0.125000 | -0.083008 | 0.230469 | -0.220703 | -0.125000 | -0.170898 | 0.190430 | 0.384766 | 0.085449 | -0.126953 | 0.209961 | -0.095215 | -0.045410 | -0.196289 | -0.164062 | 0.040039 | 0.166016 | -0.094727 | 0.110352 | 0.124512 | -0.102539 | -0.197266 | -0.042480 | -0.091797 | 0.151367 | 0.016235 | 0.141602 | -0.102539 | 0.076660 | 0.126953 | 0.251953 | 0.092773 | 0.182617 | 0.091309 | -0.492188 | 0.155273 | -0.135742 | -0.353516 | -0.220703 | -0.191406 | -0.057861 | 0.241211 | -0.127930 | 0.225586 | -0.061768 | 0.443359 | 0.053467 | 0.198242 | -0.218750 | -0.386719 | -0.156250 | 0.080566 | -0.247070 | 0.597656 | 0.149414 | -0.246094 | 0.082031 | 0.150391 | -0.193359 | 0.027832 | -0.291016 | 0.328125 | 0.130859 | -0.166016 | -0.308594 | 0.231445 | 0.242188 | 0.195312 | -0.116211 | 0.046875 | 0.248047 | 0.129883 |
afghan | -0.220703 | -0.226562 | -0.128906 | 0.261719 | 0.091309 | -0.052490 | 0.375000 | -0.349609 | -0.351562 | 0.080078 | -0.125977 | -0.496094 | -0.130859 | -0.175781 | -0.330078 | -0.164062 | -0.234375 | -0.022583 | -0.050049 | -0.365234 | -0.098145 | 0.020264 | 0.247070 | -0.086426 | 0.085449 | 0.128906 | -0.326172 | -0.004883 | -0.239258 | -0.429688 | -0.080078 | -0.015991 | -0.349609 | -0.070312 | -0.169922 | -0.034424 | 0.082031 | 0.062988 | 0.045166 | -0.065430 | -0.218750 | 0.542969 | 0.316406 | 0.091797 | 0.160156 | -0.060791 | -0.105957 | 0.065430 | 0.154297 | -0.090332 | -0.045898 | -0.005066 | 0.094238 | -0.079590 | -0.507812 | 0.018677 | -0.019775 | 0.093750 | -0.065918 | -0.011108 | -0.292969 | -0.015625 | -0.294922 | 0.193359 | 0.050293 | 0.173828 | -0.441406 | -0.139648 | -0.241211 | 0.218750 | 0.255859 | 0.066895 | -0.433594 | -0.384766 | -0.021362 | -0.030762 | 0.228516 | -0.202148 | 0.251953 | 0.103027 | -0.071289 | -0.314453 | -0.101074 | 0.156250 | -0.219727 | -0.173828 | 0.154297 | 0.228516 | 0.279297 | 0.033447 | -0.153320 | -0.073730 | -0.061523 | -0.150391 | -0.114258 | 0.175781 | 0.193359 | -0.277344 | 0.318359 | -0.412109 | -0.167969 | 0.097168 | 0.320312 | -0.176758 | -0.235352 | 0.298828 | -0.073730 | 0.003281 | -0.176758 | 0.145508 | 0.013977 | -0.265625 | 0.341797 | -0.054932 | 0.171875 | -0.159180 | -0.037598 | 0.157227 | 0.166016 | 0.102539 | -0.009216 | 0.043701 | 0.291016 | -0.205078 | 0.029663 | 0.150391 | -0.057617 | -0.074707 | -0.162109 | 0.406250 | 0.159180 | 0.174805 | 0.070312 | 0.351562 | 0.076172 | 0.116699 | -0.353516 | -0.353516 | 0.417969 | 0.019775 | -0.014160 | 0.168945 | 0.070312 | -0.052734 | -0.139648 | -0.029297 | -0.199219 | -0.096680 | 0.055664 | -0.263672 | 0.376953 | -0.065430 | 0.028076 | 0.120117 | 0.095703 | -0.498047 | -0.123535 | 0.099609 | -0.294922 | 0.292969 | -0.185547 | -0.207031 | 0.239258 | -0.263672 | -0.112305 | 0.114746 | 0.242188 | -0.197266 | 0.178711 | 0.122559 | 0.205078 | -0.059082 | 
-0.335938 | -0.291016 | -0.116699 | -0.046143 | 0.141602 | -0.185547 | 0.257812 | 0.063477 | -0.035400 | -0.245117 | 0.031738 | 0.345703 | 0.003189 | -0.291016 | 0.035889 | 0.083008 | -0.096191 | 0.306641 | 0.296875 | 0.083984 | 0.011780 | 0.102539 | 0.182617 | -0.034668 | 0.086914 | 0.180664 | -0.298828 | -0.091309 | -0.511719 | -0.065918 | 0.226562 | -0.050781 | -0.029785 | 0.007812 | 0.076660 | 0.165039 | 0.183594 | 0.318359 | -0.429688 | 0.033691 | -0.558594 | -0.150391 | 0.554688 | -0.020630 | 0.028442 | 0.013672 | -0.195312 | 0.057617 | -0.221680 | 0.099121 | 0.138672 | 0.151367 | -0.271484 | -0.095703 | 0.267578 | 0.101074 | 0.390625 | -0.365234 | 0.027344 | -0.071777 | -0.189453 | 0.054443 | -0.068848 | 0.007996 | 0.233398 | 0.033203 | 0.182617 | 0.110840 | -0.214844 | 0.376953 | 0.417969 | -0.123047 | -0.177734 | 0.275391 | 0.125977 | -0.330078 | -0.104004 | -0.197266 | -0.279297 | -0.133789 | 0.127930 | 0.267578 | 0.085449 | -0.000969 | -0.099609 | 0.085938 | 0.128906 | 0.007782 | -0.205078 | 0.121094 | 0.023682 | -0.324219 | -0.185547 | 0.105957 | 0.050537 | 0.079102 | -0.253906 | -0.048096 | -0.164062 | 0.333984 | -0.224609 | 0.050781 | 0.412109 | 0.093750 | 0.472656 | -0.324219 | -0.241211 | 0.131836 | 0.296875 | -0.078125 | 0.165039 | 0.207031 | -0.159180 | -0.024536 | -0.055908 | 0.015625 | 0.359375 | 0.126953 | -0.267578 | -0.010010 | -0.066895 | -0.173828 | 0.035156 | -0.156250 | 0.012634 | 0.216797 | 0.102539 | 0.341797 |
akkadian | 0.220703 | 0.250000 | -0.143555 | 0.119141 | 0.106934 | 0.161133 | 0.283203 | -0.137695 | -0.097656 | 0.259766 | -0.040283 | -0.181641 | -0.257812 | 0.134766 | -0.203125 | -0.259766 | -0.216797 | -0.236328 | 0.125977 | -0.081543 | -0.202148 | -0.062988 | 0.239258 | -0.070801 | -0.131836 | -0.016968 | 0.080078 | 0.192383 | 0.025635 | -0.275391 | 0.099609 | -0.201172 | -0.108398 | 0.165039 | 0.061523 | -0.049316 | -0.016846 | 0.034668 | 0.051514 | -0.184570 | 0.178711 | 0.163086 | 0.292969 | 0.229492 | 0.002869 | 0.022095 | -0.076660 | -0.078125 | -0.292969 | -0.193359 | -0.357422 | -0.098633 | -0.077148 | -0.233398 | -0.015625 | 0.455078 | -0.159180 | -0.001320 | -0.132812 | -0.279297 | 0.152344 | 0.153320 | 0.251953 | 0.098145 | -0.271484 | -0.247070 | -0.161133 | 0.047363 | -0.085938 | -0.220703 | 0.032471 | -0.030151 | -0.265625 | -0.042236 | -0.139648 | -0.103516 | 0.013550 | 0.000633 | 0.148438 | -0.063477 | 0.158203 | -0.175781 | -0.253906 | -0.234375 | 0.072754 | 0.292969 | -0.044189 | 0.020508 | 0.160156 | 0.093750 | 0.062988 | -0.162109 | -0.039062 | 0.169922 | 0.014099 | -0.110352 | -0.058105 | 0.087891 | -0.041504 | -0.129883 | -0.098145 | -0.196289 | 0.122559 | -0.320312 | 0.078125 | -0.121094 | -0.400391 | 0.009949 | 0.153320 | 0.216797 | 0.010925 | 0.158203 | 0.316406 | -0.263672 | 0.223633 | 0.043213 | -0.145508 | -0.176758 | 0.047607 | -0.048828 | -0.029053 | 0.150391 | 0.173828 | 0.095703 | -0.099121 | -0.170898 | -0.149414 | -0.122070 | -0.417969 | 0.425781 | -0.155273 | 0.053223 | -0.375000 | 0.069824 | -0.062500 | 0.138672 | -0.075195 | 0.014465 | -0.130859 | 0.316406 | 0.011780 | 0.207031 | 0.043945 | 0.044434 | 0.214844 | 0.113770 | 0.067383 | 0.146484 | -0.053467 | -0.219727 | 0.207031 | -0.251953 | -0.116211 | 0.214844 | 0.053223 | -0.025269 | -0.332031 | 0.300781 | 0.472656 | -0.201172 | -0.180664 | -0.057861 | 0.121094 | -0.292969 | -0.114746 | 0.102539 | 0.172852 | 0.035889 | -0.211914 | 0.408203 | -0.172852 | -0.007782 | 
0.215820 | 0.011536 | 0.494141 | -0.081055 | 0.101562 | -0.102051 | 0.131836 | -0.255859 | -0.034668 | 0.006775 | -0.298828 | 0.002869 | -0.102539 | 0.181641 | 0.162109 | 0.070801 | 0.097656 | 0.173828 | 0.085938 | 0.016968 | -0.273438 | 0.166016 | 0.084961 | 0.047607 | 0.269531 | -0.073730 | 0.047852 | -0.073242 | 0.242188 | -0.094238 | -0.177734 | 0.091797 | 0.068359 | 0.003937 | 0.012146 | -0.137695 | -0.296875 | 0.013916 | -0.036377 | 0.031006 | 0.061279 | -0.322266 | -0.168945 | 0.421875 | -0.015869 | -0.144531 | 0.015869 | -0.103027 | -0.118164 | 0.034180 | -0.140625 | 0.095703 | -0.072754 | -0.106445 | -0.277344 | 0.125977 | -0.016602 | -0.073730 | -0.043457 | -0.211914 | -0.318359 | 0.217773 | 0.099609 | -0.016235 | 0.086914 | 0.102539 | 0.294922 | -0.025024 | 0.353516 | -0.021118 | 0.236328 | -0.159180 | 0.033447 | -0.236328 | -0.172852 | 0.324219 | -0.216797 | -0.063965 | 0.203125 | -0.166016 | 0.074219 | 0.156250 | 0.116699 | -0.239258 | -0.210938 | -0.211914 | -0.188477 | -0.041016 | 0.206055 | 0.079102 | -0.112305 | -0.062500 | -0.178711 | -0.017212 | 0.104980 | -0.333984 | 0.265625 | -0.075195 | -0.155273 | -0.034424 | -0.039551 | 0.156250 | 0.279297 | 0.073242 | 0.178711 | -0.404297 | -0.075684 | 0.205078 | -0.003677 | -0.061035 | -0.065430 | 0.048096 | 0.400391 | -0.197266 | 0.002792 | 0.225586 | -0.073730 | 0.187500 | -0.047607 | -0.172852 | -0.104004 | 0.076660 | 0.014893 | 0.063965 | 0.048340 | -0.138672 | 0.086914 | 0.015869 |
from sklearn.cluster import KMeans
# Cluster the label embeddings with K-means.
n_clusters = 12
# NOTE: the former `n_jobs=4` argument is omitted on purpose. It was deprecated
# in scikit-learn 0.23 and removed in 1.0, where passing it raises TypeError.
# Omitting it works on both old and new releases.
km = KMeans(n_clusters=n_clusters, random_state=42)
# fit() returns the fitted estimator itself, so `kmeans` and `km` are the same
# object; both names are kept because later cells use `kmeans`.
kmeans = km.fit(df_labels)
# One centre per cluster, in the same feature space as the rows of df_labels.
centers = kmeans.cluster_centers_
centers
array([[ 0.19308307, 0.1452746 , 0.00185023, ..., -0.05410911, 0.1357924 , 0.01465854], [-0.08875275, -0.00244581, 0.05723538, ..., -0.0517799 , -0.04985809, 0.12571148], [ 0.05526293, 0.10658404, 0.04825819, ..., -0.05685043, 0.06840442, 0.04145173], ..., [ 0.12212906, 0.1023327 , -0.00861059, ..., -0.04945068, 0.20849831, 0.08963013], [ 0.01238569, -0.00396772, 0.00098611, ..., -0.01999324, 0.04802599, 0.06364357], [ 0.13971718, -0.01981354, 0.03617838, ..., -0.00106684, 0.10897891, 0.07919863]])
# Sanity check: label embeddings and cluster centres stacked row-wise.
stacked_rows = np.vstack([df_labels.values, centers])
stacked_rows.shape
(884, 300)
from sklearn.manifold import TSNE
# Project the cluster centres and the label embeddings to 2-D with t-SNE.
# The centres are stacked FIRST, so rows [0, n_clusters) of the result are the
# centres and the remaining rows follow the row order of df_labels.
tsne_input = np.concatenate((centers, df_labels.values), axis=0)
# random_state is fixed so the layout is reproducible between runs, matching
# the seeded KMeans above; without it every run gives a different picture.
label_embedded = TSNE(
    n_components=2,
    perplexity=100,
    random_state=42,
).fit_transform(tsne_input)
label_embedded
array([[ 0.17776407, -0.51662195], [ 1.5100814 , 2.6683402 ], [-0.22348976, 1.4615749 ], ..., [ 2.133301 , 0.50700027], [-0.02977649, 0.45576498], [ 1.4450217 , -0.24538113]], dtype=float32)
import matplotlib.cm as cm
import matplotlib.pyplot as plt
# Scatter plot of the 2-D embedding: cluster centres as large, labelled
# markers and every label as a small marker in its cluster's colour.
colors = cm.gist_rainbow(np.linspace(0, 1, n_clusters))
plt.style.use('ggplot')
X = label_embedded[:, 0]
Y = label_embedded[:, 1]
c = kmeans.labels_
plt.figure(figsize=[10, 10])
# The first n_clusters rows of label_embedded are the cluster centres; draw
# them first (one call each so every centre gets its own legend entry).
for i in range(n_clusters):
    plt.scatter(X[i], Y[i], color=colors[i], s=500, label=str(i))
# All remaining rows are labels. Draw them with a single vectorised call
# (instead of one scatter() per point); colors[c] is one RGBA row per label,
# chosen by its cluster id.
plt.scatter(X[n_clusters:], Y[n_clusters:], c=colors[c])
plt.legend()
<matplotlib.legend.Legend at 0x7f81f8049d30>
# Store each label's cluster id in column 'C' and show the cluster sizes.
cluster_ids = kmeans.labels_
df_labels['C'] = cluster_ids
df_labels['C'].value_counts()
10 133 2 104 1 90 7 76 0 74 11 72 6 61 3 61 8 58 9 55 4 46 5 42 Name: C, dtype: int64