attribute_name's word vector

From: https://www.kaggle.com/takamichitoda/attribute-name-s-word-vector

Author: Takamichi Toda

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show what is available under the input directory.
input_entries = os.listdir("../input")
print(input_entries)

# Any results you write to the current directory are saved as output.
['fasttext-english-word-vectors-including-subwords', 'imet-2019-fgvc6']
In [2]:
from tqdm import tqdm

fasttext_path = "../input/fasttext-english-word-vectors-including-subwords/wiki-news-300d-1M-subword.vec"

def load_vecs(word, *arr):
    """Pair a token with its embedding vector.

    Args:
        word: The token (first field of a vector-file line).
        *arr: The remaining fields, one value (number or numeric string)
            per embedding component.

    Returns:
        A ``(word, vector)`` tuple, where ``vector`` is a 1-D float32
        NumPy array built from ``arr``.
    """
    vector = np.array(arr, dtype='float32')
    return word, vector

# Build the word -> vector lookup from the fastText text file.
# - The `with` block closes the file once loading finishes; the previous bare
#   open() call never released the handle.
# - The encoding is given explicitly so decoding does not depend on the locale.
# - A line with only two fields is the "<count> <dim>" header that fastText
#   .vec files may start with; it is not a word entry, so it is skipped.
with open(fasttext_path, encoding="utf-8") as vec_file:
    vec_dic = dict(
        load_vecs(*fields)
        for fields in (line.rstrip().split(' ') for line in tqdm(vec_file))
        if len(fields) > 2
    )
1000000it [01:32, 10866.56it/s]
In [3]:
# Read the label table and preview five rows; the fixed seed keeps the
# preview identical from run to run.
LABELS_CSV = "../input/imet-2019-fgvc6/labels.csv"
labels = pd.read_csv(LABELS_CSV)
labels.sample(n=5, random_state=42)
Out[3]:
attribute_id attribute_name
309 309 culture::saint-cloud
1041 1041 tag::towns
381 381 culture::verona
497 497 tag::buddha
551 551 tag::clocks
In [4]:
# Split "type::name" into its two parts with one vectorised call.
# Splitting only on the FIRST "::" (n=1) keeps any later "::" inside the name
# instead of silently dropping everything after it.
name_parts = labels["attribute_name"].str.split("::", n=1, expand=True)
labels["type"] = name_parts[0]
labels["name"] = name_parts[1]
labels.sample(5, random_state=42)
Out[4]:
attribute_id attribute_name type name
309 309 culture::saint-cloud culture saint-cloud
1041 1041 tag::towns tag towns
381 381 culture::verona culture verona
497 497 tag::buddha tag buddha
551 551 tag::clocks tag clocks
In [5]:
# Collect the bare names of each attribute type as NumPy arrays.
is_tag = labels["type"] == "tag"
is_culture = labels["type"] == "culture"
tag_names = labels.loc[is_tag, "name"].values
culture_names = labels.loc[is_culture, "name"].values
In [6]:
def get_vec(w, dim=300):
    """Look up the embedding of a single word.

    Args:
        w: The word to look up in the module-level ``vec_dic`` mapping.
        dim: Length of the zero vector returned for an unknown word. The
            default of 300 is the length the fallback previously hard-coded.

    Returns:
        The stored vector for ``w`` when it is a key of ``vec_dic``;
        otherwise a float32 zero vector of length ``dim``.
    """
    try:
        return vec_dic[w]
    except KeyError:
        # Use float32 so the fallback has the same dtype load_vecs gives the
        # stored vectors; a float64 fallback would upcast any average it joins.
        return np.zeros(dim, dtype='float32')
    
def mean_word_vectors(names):
    """Embed each name as the average of the vectors of its words.

    Args:
        names: Iterable of name strings. Each name is split on whitespace
            and every resulting word is looked up with ``get_vec``.

    Returns:
        A NumPy array with one row per name, in input order.
    """
    rows = []
    for name in names:
        # A blank name yields no words; look the name up whole instead so it
        # takes get_vec's unknown-word fallback rather than dividing by zero.
        words = name.split() or [name]
        word_vecs = [get_vec(word) for word in words]
        rows.append(sum(word_vecs) / len(word_vecs))
    return np.array(rows)


# Same procedure for both attribute types (previously two copy-pasted loops).
tag_vecs = mean_word_vectors(tag_names)
culture_vecs = mean_word_vectors(culture_names)
In [7]:
from sklearn.manifold import TSNE

# Print arrays in plain decimal notation rather than scientific notation.
np.set_printoptions(suppress=True)

# Project each set of name vectors down to two dimensions. The fixed
# random_state keeps the layout repeatable; the fitted coordinates are read
# back later from each model's `embedding_` attribute.
tag_model = TSNE(n_components=2, random_state=42)
tag_model.fit_transform(tag_vecs)

culture_model = TSNE(n_components=2, random_state=42)
culture_model.fit_transform(culture_vecs)
Out[7]:
array([[  0.28423035,  -6.649542  ],
       [  3.4337585 ,  -0.98475623],
       [  6.58496   ,   1.0231379 ],
       [ 10.636925  ,   2.1557796 ],
       [  3.906092  ,   5.027689  ],
       [  3.888501  ,   4.8814664 ],
       [  3.6921833 ,   4.714589  ],
       [  4.1385503 ,   5.035313  ],
       [  3.7370276 ,   4.650681  ],
       [  3.463585  ,  -0.751787  ],
       [ -6.473607  ,  -6.1827707 ],
       [  5.3205385 ,   1.7229557 ],
       [ -6.5254927 ,  -7.213172  ],
       [  7.474705  ,   5.6832423 ],
       [  5.767618  ,   5.3494363 ],
       [ -2.3583748 ,   1.376164  ],
       [ -3.319958  ,  -5.262825  ],
       [ -2.160755  ,   1.3292158 ],
       [ -2.0019693 ,  -6.6520333 ],
       [  5.2463326 ,   1.0745995 ],
       [  1.1886401 ,   1.2794417 ],
       [ -3.317036  ,  -7.323565  ],
       [  8.520397  ,   1.2864101 ],
       [  4.6427097 ,   0.3236351 ],
       [  0.4808778 ,   5.485854  ],
       [  1.6576103 ,   4.9309015 ],
       [ -3.6095898 ,  -5.9523926 ],
       [ -1.3548615 ,  -2.6424656 ],
       [ -0.25954008,  -1.7527877 ],
       [  5.7750936 ,   3.5926235 ],
       [ -1.0502791 ,  -5.108099  ],
       [ -4.1231613 ,   2.198848  ],
       [  3.808557  ,  -1.8519245 ],
       [  3.8490973 ,  -0.1214932 ],
       [  2.640492  ,   1.68397   ],
       [ -0.04749239,  -2.0593696 ],
       [  3.8161619 ,   0.871986  ],
       [  0.19810152,   1.4409043 ],
       [ -4.959411  ,  -4.357667  ],
       [ -2.9264715 ,  -9.385999  ],
       [ -0.5929508 ,  -6.1488147 ],
       [  4.7686787 ,   6.373162  ],
       [ -1.1936778 ,   3.7276168 ],
       [ -3.562984  ,   3.5177124 ],
       [ -2.9010694 ,  -6.7550077 ],
       [  2.0815773 ,   0.49240813],
       [ -1.0672711 ,   1.0536165 ],
       [ -2.4841168 ,   0.5637619 ],
       [  2.818948  ,   3.9286008 ],
       [ -2.6360881 ,  -5.4900618 ],
       [ -3.2907693 ,   3.9951963 ],
       [  6.768983  ,   6.196547  ],
       [  5.5317163 ,   5.757345  ],
       [  6.0292044 ,   5.931406  ],
       [ -2.5302858 ,   1.1558292 ],
       [ -0.8244637 ,   2.470702  ],
       [  8.22289   ,  -1.4896351 ],
       [ -3.0903149 ,  -7.1924725 ],
       [ -0.02573751,  -2.5121696 ],
       [ -1.9732729 ,  -6.7242937 ],
       [  8.473351  ,  -1.3343337 ],
       [ -1.9834732 ,  -6.608582  ],
       [  3.5406835 ,  -0.31572595],
       [ -2.5393965 ,  -6.541403  ],
       [ -0.18066728,  -0.12536928],
       [  1.1677067 ,   2.8960903 ],
       [  0.65186405,   1.8489292 ],
       [  5.606357  ,   2.6251516 ],
       [ -3.3129354 ,  -7.040898  ],
       [  8.437757  ,   1.225085  ],
       [  6.783384  ,   2.9010758 ],
       [  7.4217396 ,   1.9606566 ],
       [  6.685169  ,   3.8740122 ],
       [ -2.0636609 ,  -7.080975  ],
       [ -0.9394095 ,  -1.0949852 ],
       [ -3.5773757 ,  -6.8786297 ],
       [ -4.3400836 ,   4.7216134 ],
       [ -2.7279015 ,  -6.943962  ],
       [ -1.9840099 ,  -6.594297  ],
       [  9.595266  ,   0.59201276],
       [  3.519705  ,   4.16021   ],
       [  3.5413475 ,   3.9633358 ],
       [  3.35584   ,   4.1097145 ],
       [ -2.0803142 ,  -7.334229  ],
       [ -2.9846754 ,  -6.900628  ],
       [ -2.7367365 ,  -7.226394  ],
       [ -2.0957847 ,  -7.191443  ],
       [  8.185155  ,   6.3728633 ],
       [  7.662436  ,   5.412745  ],
       [  7.4860787 ,   5.4888687 ],
       [ -2.2848797 ,   2.27044   ],
       [  2.8157656 ,  -0.30139294],
       [  7.774714  ,   0.56645185],
       [  4.83845   ,  -3.0852861 ],
       [  4.7709947 ,  -2.9920766 ],
       [  2.169179  ,   2.6670458 ],
       [  2.1934206 ,   2.776527  ],
       [ -3.4893677 ,  -6.704201  ],
       [ -3.8065412 ,  -6.838275  ],
       [  3.4919407 ,   2.0560994 ],
       [  3.2631986 ,   2.1259995 ],
       [  5.349166  ,   3.4369524 ],
       [  4.6223655 ,   3.7811346 ],
       [  5.8446064 ,  -0.3144905 ],
       [ -2.0853436 ,  -7.311495  ],
       [ -2.1749883 ,  -0.38655087],
       [ -4.499235  ,   3.591615  ],
       [ -2.5391033 ,  -7.3436003 ],
       [ -4.152526  ,   1.9477427 ],
       [ -1.918515  ,   2.875991  ],
       [ -3.0547884 ,   4.07718   ],
       [  4.3890066 ,   6.449248  ],
       [ -2.278692  ,  -7.634931  ],
       [  8.427747  ,   3.7492409 ],
       [  7.9573436 ,   0.73424643],
       [  6.944306  ,   2.9377918 ],
       [  7.039512  ,   2.460665  ],
       [  5.590624  ,   4.634189  ],
       [ -3.595289  ,   3.3237236 ],
       [ -2.8755033 ,  -7.420247  ],
       [  5.5603967 ,   9.435794  ],
       [  5.582008  ,   9.137235  ],
       [ -4.002877  ,  -6.941772  ],
       [ 10.153061  ,   3.963696  ],
       [ -3.2012231 ,  -8.049549  ],
       [  2.7836578 ,  -1.61689   ],
       [ -2.6488366 ,  -7.8849583 ],
       [  6.7571125 ,   3.0801094 ],
       [  3.389649  ,   4.1558323 ],
       [ -2.6598325 ,  -7.8969145 ],
       [ -4.385247  ,  -7.5272894 ],
       [  4.466883  ,   5.874856  ],
       [  5.295967  ,   5.491887  ],
       [ -0.56026745,   3.023841  ],
       [  2.5493956 ,   6.464499  ],
       [  2.5710888 ,   6.4343667 ],
       [  2.4836626 ,   6.4830713 ],
       [  2.5524783 ,   6.4681516 ],
       [  2.5440588 ,   6.4545546 ],
       [  2.5867646 ,   6.4382834 ],
       [  2.5298045 ,   6.3818984 ],
       [  2.5537722 ,   6.439446  ],
       [  2.576377  ,   6.4104457 ],
       [  2.5577128 ,   6.4359727 ],
       [ -2.99366   ,  -8.005136  ],
       [  3.0178204 ,   0.614008  ],
       [ -8.181392  ,  -5.728982  ],
       [  5.056964  ,   6.2969236 ],
       [  5.3483915 ,   5.775726  ],
       [  5.443621  ,   5.630147  ],
       [  5.324191  ,   6.103396  ],
       [ -2.9432015 ,  -7.996961  ],
       [ -3.9744391 ,  -7.874736  ],
       [  1.7975188 ,   2.0902452 ],
       [ -1.489239  ,   2.0316794 ],
       [ -0.25952876,   0.22863975],
       [  5.9131293 ,   4.190448  ],
       [  5.3479304 ,   6.0610304 ],
       [ -3.6066341 ,  -8.034933  ],
       [ -3.6708748 ,  -8.04161   ],
       [ -3.3597112 ,  -8.050183  ],
       [  8.715019  ,   4.411202  ],
       [  8.615336  ,   4.2952046 ],
       [  5.770435  ,   7.5532875 ],
       [ -3.8405013 ,  -7.8741117 ],
       [ -4.207675  ,  -7.7476606 ],
       [  4.799607  ,  -1.3553778 ],
       [  0.6751858 ,  -0.4048471 ],
       [ -3.1705902 ,  -6.2172117 ],
       [ -3.9287117 ,  -7.258784  ],
       [ -3.1625693 ,  -6.228419  ],
       [  2.223227  ,   3.7565618 ],
       [  3.0209577 ,   3.1187904 ],
       [ -3.830234  ,  -5.393314  ],
       [ -3.820852  ,  -5.8025074 ],
       [  4.9992123 ,   3.0639417 ],
       [  4.850321  ,   2.783912  ],
       [  0.5316897 ,   0.02781353],
       [  0.8163349 ,  -1.2919081 ],
       [  4.074511  ,  -2.5950222 ],
       [  9.583752  ,   2.104148  ],
       [  5.7901187 ,   5.1435404 ],
       [  8.453797  ,  -0.99508786],
       [  3.8172045 ,   3.0876179 ],
       [ 10.392559  ,   1.8639275 ],
       [  6.843153  ,   6.6396    ],
       [ -4.7059255 ,  -7.286228  ],
       [  2.624756  ,  -1.0414939 ],
       [  5.4889526 ,   0.99607295],
       [  6.766441  ,   4.394032  ],
       [  5.5378366 ,   5.359208  ],
       [  5.7017093 ,   5.4678025 ],
       [  5.8419857 ,   7.661859  ],
       [ -3.8440921 ,  -5.3868594 ],
       [ 10.105099  ,  -0.18240501],
       [  3.9461772 ,   0.7536425 ],
       [ -4.413209  ,  -5.8874335 ],
       [  7.6797953 ,   0.2842236 ],
       [  2.0767102 ,  -0.9052446 ],
       [ -3.9594636 ,  -5.4204707 ],
       [ -3.2425697 ,  -6.178654  ],
       [ -4.2367797 ,  -5.6026654 ],
       [ 10.054718  ,  -0.14645138],
       [ -0.88915396,  -0.1450653 ],
       [ -3.7066386 ,  -6.2292776 ],
       [ -3.6374495 ,  -6.0052695 ],
       [ -3.4594617 ,   2.4600332 ],
       [ -4.2345014 ,  -5.5934815 ],
       [ -4.4770994 ,  -5.91783   ],
       [ -1.6951438 ,   0.4249166 ],
       [ -3.9043286 ,  -6.27861   ],
       [ -4.2867794 ,   4.5970078 ],
       [ -2.865316  ,   4.6457086 ],
       [ -2.9093132 ,   4.3974595 ],
       [ -0.8648768 ,  -2.4614303 ],
       [ -3.9373596 ,  -6.309779  ],
       [ -4.506344  ,  -6.0219297 ],
       [  2.709575  ,  -4.745315  ],
       [-10.343428  ,   9.396051  ],
       [  0.80608094,   0.5239644 ],
       [ -3.983665  ,  -6.658218  ],
       [  4.3774133 ,   1.818737  ],
       [  0.33742386,   2.6819465 ],
       [ -4.532634  ,  -6.0987825 ],
       [  1.4009203 ,  -0.20911108],
       [ -4.64603   ,  -6.4367914 ],
       [  4.048834  ,  -2.4684927 ],
       [ -4.0731697 ,  -6.7454357 ],
       [  3.2564526 ,   4.0383573 ],
       [ -4.522603  ,  -6.7503643 ],
       [  2.2384841 ,   1.5800599 ],
       [  8.062226  ,   6.2629285 ],
       [ -4.681645  ,  -6.5165353 ],
       [ -4.698122  ,  -6.4604373 ],
       [  7.16226   ,   8.281689  ],
       [ -4.692712  ,  -6.921371  ],
       [  3.555629  ,   0.32225528],
       [ -4.7057776 ,  -6.8906813 ],
       [ -4.503094  ,  -7.287298  ],
       [ -0.8549162 ,  -0.2179615 ],
       [ -3.54943   ,  -7.490671  ],
       [ -4.4022474 ,  -7.4665604 ],
       [  5.0550075 ,  -3.1417243 ],
       [  5.340993  ,   1.8239356 ],
       [ -4.1914334 ,  -7.790666  ],
       [  5.082689  ,  -1.0381842 ],
       [ -2.4728293 ,  -7.7868705 ],
       [ -1.674147  ,   3.2313194 ],
       [ -3.2618587 ,  -7.6226664 ],
       [ -3.8160853 ,  -7.36062   ],
       [ -2.6640997 ,  -6.535986  ],
       [ -2.606799  ,  -6.5717335 ],
       [ -0.57148975,   2.578121  ],
       [ -2.872808  ,  -6.455635  ],
       [ -3.4302301 ,  -6.9181213 ],
       [  0.13632502,  -5.7060266 ],
       [  0.09623385,  -7.6081953 ],
       [  3.6238115 ,   0.6824755 ],
       [  9.563455  ,   2.0630805 ],
       [ -0.78425074,  -7.7704597 ],
       [ -8.181595  ,  -5.7286453 ],
       [  0.81170285,  -3.1363053 ],
       [ -0.5316455 ,  -6.63054   ],
       [  9.469518  ,   0.6646408 ],
       [  8.509765  ,   2.6661544 ],
       [  7.5921016 ,   3.6352782 ],
       [  7.8742747 ,   0.6588182 ],
       [  6.8690515 ,   2.985314  ],
       [  9.344767  ,   2.0959089 ],
       [  6.8956447 ,   3.8256485 ],
       [  9.45875   ,   0.641654  ],
       [  8.120051  ,   0.9689106 ],
       [  4.5563903 ,   3.6163218 ],
       [ -3.274574  ,  -1.6137227 ],
       [ -5.1217713 ,  -3.48914   ],
       [  3.4226115 ,   4.275275  ],
       [ -2.144584  ,  -5.8693438 ],
       [ -3.3899605 ,   1.2014669 ],
       [ -6.089215  ,  -6.6644855 ],
       [ -0.75035006,  -5.5918064 ],
       [ 10.277746  ,   1.9865019 ],
       [ -0.23877507,  -0.49615198],
       [ -2.6055238 ,  -4.007356  ],
       [ -0.82763094,   3.6915452 ],
       [ -4.314496  ,  -3.2997477 ],
       [  3.7776985 ,  -0.507556  ],
       [  2.5814748 ,   1.6627333 ],
       [  8.203982  ,   6.4114594 ],
       [ -0.4518331 ,  -8.450321  ],
       [  2.5550594 ,  -1.1819118 ],
       [  2.763713  ,  -4.6776404 ],
       [  7.321605  ,   1.4861965 ],
       [  2.9382195 ,   5.0530825 ],
       [ -2.4096653 ,  -5.6811585 ],
       [  6.5229793 ,   5.32796   ],
       [ -1.5819222 ,   1.8935817 ],
       [ -2.810885  ,  -5.999563  ],
       [ -2.3879685 ,  -5.6092277 ],
       [  6.739581  ,   2.18685   ],
       [  1.7714599 ,  -3.4175851 ],
       [ -2.1012332 ,  -6.033421  ],
       [  2.9924552 ,  -1.727697  ],
       [ -2.8057935 ,  -5.427577  ],
       [ -3.1648564 ,  -5.7847233 ],
       [  5.6264477 ,   8.750496  ],
       [  5.618509  ,   8.822239  ],
       [  5.629651  ,   8.418702  ],
       [ -3.2812102 ,  -5.291537  ],
       [  4.385864  ,   4.564092  ],
       [ -2.092524  ,  -6.1007614 ],
       [ -2.8106203 ,  -5.384197  ],
       [ -2.720718  ,  -6.016743  ],
       [  1.1998509 ,  -2.2340357 ],
       [ -2.0960145 ,  -5.9982696 ],
       [ -1.1904598 ,  -0.9581867 ],
       [  0.03185305,   1.4853476 ],
       [  4.260306  ,   3.2063284 ],
       [ -2.8959684 ,  -0.38755408],
       [  6.6238875 ,   6.4776983 ],
       [  3.7726321 ,  -0.7508863 ],
       [ -2.8155181 ,  -5.3854585 ],
       [ -1.7022835 ,   0.91199386],
       [ -3.3370602 ,  -5.263254  ],
       [ -3.7516053 ,   3.5688932 ],
       [  9.382737  ,   5.5760055 ],
       [ -3.3388028 ,  -5.267311  ],
       [  0.35629377,   1.1788176 ],
       [ -3.853849  ,  -9.364736  ],
       [ -5.406179  ,  -4.767619  ],
       [  1.8461303 ,  -1.9000263 ],
       [  7.6229987 ,   3.501211  ],
       [  7.6019397 ,   3.6396039 ],
       [  7.874691  ,   0.6735904 ],
       [ -5.9854064 ,  -5.823237  ],
       [  6.7165146 ,   3.654142  ],
       [  6.2871246 ,   5.487713  ],
       [ -1.137398  ,  -8.419927  ],
       [  5.3632812 ,  -1.3577485 ],
       [ -3.3206704 ,   3.379782  ],
       [ -3.6282716 ,   2.1355765 ],
       [ -2.2726262 ,   2.232069  ],
       [ -6.3597345 ,  -5.024985  ],
       [ -5.730423  ,  -5.2472367 ],
       [  1.794038  ,  -0.92911035],
       [  1.9084928 ,  -1.0496347 ],
       [  4.1221366 ,   0.02973941],
       [ -3.7405875 ,   3.6398087 ],
       [  4.623503  ,   3.6963916 ],
       [  5.324186  ,   6.6483984 ],
       [  5.0813646 ,   0.6079031 ],
       [ -5.9572473 ,  -7.570661  ],
       [ -2.1503978 ,  -9.257671  ],
       [ -1.6390585 ,  -8.897703  ],
       [ -1.1815039 ,  -9.337881  ],
       [  8.425808  ,  -1.355962  ],
       [  2.5404122 ,  -2.3634849 ],
       [  2.929902  ,   5.48077   ],
       [ -4.784057  ,  -9.087957  ],
       [ -5.4157023 ,  -9.052374  ],
       [  7.226141  ,  -2.5516493 ],
       [  7.1650167 ,  -2.5636075 ],
       [ -6.1835837 ,  -8.204166  ],
       [ -3.4585133 ,  -9.847988  ],
       [ -4.454839  ,  -9.682     ],
       [ -2.4141858 ,  -9.902653  ],
       [ -5.568454  ,  -8.427809  ],
       [ -0.56410116,  -7.200191  ],
       [ -0.41545838,   2.9841766 ],
       [  5.123544  ,   4.584803  ],
       [  5.1702013 ,   5.1501055 ],
       [ -3.7831116 ,  -3.9439888 ],
       [ -5.8174562 ,  -4.096374  ],
       [  2.5058575 ,   4.564542  ],
       [  1.327482  ,   4.190374  ],
       [ -1.4502038 ,  -4.5618134 ],
       [ -0.3027569 ,  -4.9114256 ],
       [ -0.61584103,  -1.3884171 ],
       [  0.2509382 ,   2.219613  ],
       [  4.0380573 ,   7.4949236 ],
       [ -0.9073974 ,  -4.350362  ],
       [ -4.3808084 ,  -4.1028576 ],
       [  0.01772884,  -0.2859563 ],
       [ -1.2794785 ,   1.8759428 ],
       [ -0.37010437,   3.6627462 ],
       [  8.70783   ,  -1.3625113 ],
       [ -2.6806512 ,  -3.365781  ],
       [ -3.5187652 ,  -3.181396  ],
       [ -1.9004717 ,  -4.3148975 ],
       [  3.0001779 ,   0.49040544],
       [ -3.1530912 ,  -3.865217  ],
       [  1.2122366 ,  -0.29572207],
       [  8.2288065 ,   3.3759832 ],
       [  6.8743486 ,   2.978995  ],
       [ -3.586135  ,   2.0957088 ],
       [ -1.8827164 ,  -3.8030055 ],
       [ -3.41658   ,  -6.4262433 ],
       [  3.7913375 ,  -0.7227018 ],
       [ -0.06317055,   1.4508319 ]], dtype=float32)
In [8]:
import matplotlib.pyplot as plt 

tag: t-SNE map of the tag name vectors

In [9]:
# Scatter the 2-D t-SNE coordinates of the tag names and label every point.
# The figure is very large so that the many text labels stay legible.
fig, ax = plt.subplots(figsize=(40, 40))
tag_xy = tag_model.embedding_
ax.scatter(tag_xy[:, 0], tag_xy[:, 1])

# Each row of the embedding is the (x, y) position of the name at the same index.
for label, (x, y) in zip(tag_names, tag_xy):
    ax.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')

ax.set_title("t-SNE of tag name word vectors")
plt.show()

culture: t-SNE map of the culture name vectors

In [10]:
# Scatter the 2-D t-SNE coordinates of the culture names and label every point.
# The figure is very large so that the many text labels stay legible.
fig, ax = plt.subplots(figsize=(40, 40))
culture_xy = culture_model.embedding_
ax.scatter(culture_xy[:, 0], culture_xy[:, 1])

# Each row of the embedding is the (x, y) position of the name at the same index.
for label, (x, y) in zip(culture_names, culture_xy):
    ax.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')

ax.set_title("t-SNE of culture name word vectors")
plt.show()